diff --git a/SCMG/__pycache__/_version.cpython-310.pyc b/SCMG/__pycache__/_version.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..dd53d1cdbb86b3f20d7da45154c1be8b81a09edc Binary files /dev/null and b/SCMG/__pycache__/_version.cpython-310.pyc differ diff --git a/SCMG/_version.py b/SCMG/_version.py new file mode 100644 index 0000000000000000000000000000000000000000..59a04da4d4c3180edab3d852e5d834c5169849da --- /dev/null +++ b/SCMG/_version.py @@ -0,0 +1,2 @@ +def get_versions(): + version = "0.1.1" diff --git a/SCMG/config/__init__.py b/SCMG/config/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/SCMG/config/__pycache__/__init__.cpython-310.pyc b/SCMG/config/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..599d93e1d892f0000e63bc183b8371cde3beb7bb Binary files /dev/null and b/SCMG/config/__pycache__/__init__.cpython-310.pyc differ diff --git a/SCMG/config/__pycache__/modelparameters.cpython-310.pyc b/SCMG/config/__pycache__/modelparameters.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..48a5fcd77f31050c7491ccecc6fc15f26d47b2e8 Binary files /dev/null and b/SCMG/config/__pycache__/modelparameters.cpython-310.pyc differ diff --git a/SCMG/config/__pycache__/varables.cpython-310.pyc b/SCMG/config/__pycache__/varables.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9924e92650a1390128fea2c726e4a0e5192a5689 Binary files /dev/null and b/SCMG/config/__pycache__/varables.cpython-310.pyc differ diff --git a/SCMG/config/modelparameters.py b/SCMG/config/modelparameters.py new file mode 100644 index 0000000000000000000000000000000000000000..1870a10a1615761eeb9d84bdad5dccfb2e2eac92 --- /dev/null +++ b/SCMG/config/modelparameters.py @@ -0,0 +1,21 @@ +# class ModelParameters(): +# def __init__(self): +# self.NUM_LAYERS = "num_layers" 
+# self.NUM_HEADS = "num_heads" +# self.DIM_ATTENTION = "dim_attention" +# self.DIM_FEEDFORWARD = "dim_feedforward" +# self.DIM_LSTM = "dim_lstm" +# self.DIM_EMBEDDING = "dim_embedding" +# self.DIM_OUTPUT = "dim_output" +# self.RATE_DROPOUT = "rate_dropout" +# return +# + +NUM_LAYERS = "num_layers" +NUM_HEADS = "num_heads" +DIM_ATTENTION = "dim_attention" +DIM_FEEDFORWARD = "dim_feedforward" +DIM_LSTM = "dim_lstm" +DIM_EMBEDDING = "dim_embedding" +DIM_OUTPUT = "dim_output" +RATE_DROPOUT = "rate_dropout" diff --git a/SCMG/config/varables.py b/SCMG/config/varables.py new file mode 100644 index 0000000000000000000000000000000000000000..bc89401a9fc007b148cf6c64e2d0191c2ff02ba9 --- /dev/null +++ b/SCMG/config/varables.py @@ -0,0 +1,234 @@ +import re +from rdkit import Chem + +DEFAULT = "default" +AUTO = "auto" + +# Variables +COLUMN_SMILES = "SMILES" +COLUMN_ENCODER = "Encoder" +COLUMN_DECODER = "Decoder" +COLUMN_TASK_TYPE = "TaskType" +COLUMN_ENCODER_SEQUENCE = "EncoderSequence" +COLUMN_DECODER_SEQUENCE = "DecoderSequence" +COLUMN_BOS_TOKEN = "TokenBOS" +COLUMN_CUTS = "Cuts" +COLUMN_MIN_TOP_P = "MinTopP" +COLUMN_MIN_TOKEN_PROB = "MinTokenProb" +COLUMN_TOKEN_EOS_PROB = "TokenEOSProb" +COLUMN_MOLNAME = "MolName" +COLUMN_MOLINDEX = "MolIndex" +COLUMN_MOL_PROB = "MolProb" +COLUMN_MOL_PROB_TOPP = "MolProb_TopP" + +# Task +TOKEN_BEGIN = "" +TOKEN_END = "" +TOKEN_SEP = "" +TOKEN_CODER_SEP = "" +# TRAIN = "Train" +TOKEN_PAD = "" +COLUMN_EXCLUDED_MIN = "ExcludedSize" +COLUMN_SIZE_ToRunForNExt = "ExcludedSize" +COLUMN_SIZE_EXCLUDED = "ExcludedSize" + +# char_level_molecule_generation +COLUMN_task_char_mg = "char_mg" +TOKEN_TASK_CHAR_MG = "" + +# char_level_scaffold_constrained_molecule_generation +COLUMN_task_char_scmg = "char_scmg" +TOKEN_TASK_SCMG_CHAR_RAND = "" +TOKEN_TASK_SCMG_CHAR_CANO = "" +TOKEN_TASK_DG_CHAR_RAND = "" +TOKEN_TASK_DG_CHAR_CANO = "" +LIST_HEAVY_ATOMS = ['c', 'C', 'O', 'N', 'n', 'F', '[C@H]', 'Cl', '[C@@H]', 'S', '[nH]', 's', 'o', 'Br', '[C@]', '[C@@]', 'P', 
'B', '[N+]', '[P@@]', '[P@]', '[S@@]', '[N@+]', '[S@]', '[N@@+]', '[N-]', 'p'] +COLUMN_EXCLUDE_REASON = "Excluded" +COLUMN_STATE = "State" +# chemical_property_prediction +COLUMN_task_chem_pd = "chem_pd" +TOKEN_TASK_CHEM_PD = "" + +# molecule_identification +COLUMN_task_mol_id = "mol_id" +TOKEN_TASK_MOL_ID = "" + + + +FILEPATH_MODEL = "filepath_model" +FILEPATH_INPUT = "filepath_input" +DIRPATH_OUTPUT = "dirpath_output" +RANDOM_AUGUMENT = "random_augument" +TOP_P = "top_p" +TOP_K = "top_k" +MIN_MOL_PROB = "minimum_mol_prob" +MIN_TOKEN_PROB = "minimum_token_prob" +MAX_HEAVY_ATOMS = "maximum_heavy_atoms" +TEMPERATURE = "temperature" + +# Data +VOCAB = "vocab" +SIZE_VOCAB = "size_vocab" +FILENAME_VOCAB = "vocab.pt" +FILENAME_VOCABSTATE = "vocabstate.pt" +FILENAME_DATA_RAW = "data.csv" + +TRAIN = "train" +TEST = "test" +FILENAME_TRAIN_RAW = "train.pt" +FILENAME_TRAIN_EPOCH = lambda x: "train_"+str(x)+".pt" + +FILENAME_TEST = "test.pt" +FILENAME_TEST_RAW = "test.pt" +FILENAME_TEST_EPOCH = lambda x: "test_"+str(x)+".pt" +FILEPATH_VOCAB = "filepath_vocab" +# +# try: +# config.screen_width = os.get_terminal_size()[0] +# except: +# config.screen_width = 141 +MAX_SEQUENCE_LENGTH = "max_sequence_length" +COLUMN_INCHIKEY = "InchiKey" +# Train +MODEL_NAME = "model_name" +MODEL_TYPE = "model_type" +MODEL = "model" +TASKS = "tasks" +DIRPATH_CHECKPOINT = "dirpath_checkpoint" +DIRPATH_DATA = "dirpath_data" +SIZE_BATCH = "size_batch" +SIZE_BLOCK = "size_block" +RATE_LEARNING = "rate_learning" +DEVICE = "device" +EPOCH = "epoch" +EPOCHS = "epochs" +NUM_WORKERS = "num_workers" +DIRPATH_COMPLETED = "dirpath_completed" +DIRPATH_EXCLUDED = "dirpath_excluded" +DIRPATH_SBATCH = "dirpath_sbatch" + +# Stats +TRAIN_LOSS = "train_loss" +TEST_LOSS = "test_loss" +TIME_ELAPSED = "time_elapsed" +RATE_LEARNING = "rate_learning" +TOKENS = "tokens" + +# Model +FILENAME_MODEL_INIT = "model_init.pt" +FILENAME_MODEL_LATEST = "model.pt" +FILENAME_MODEL_TRAINED = lambda x: "model_"+str(x)+".pt" + 
+FILENAME_MODELSTATE_INIT = "modelstate_init.pt" +FILENAME_MODELSTATE_LATEST = "modelstate.pt" +FILENAME_MODELSTATE_TRAINED = lambda x: "modelstate_"+str(x)+".pt" + +FILENAME_SCHEDULER_INIT = "scheduler_init.pt" +FILENAME_SCHEDULER_LATEST = "scheduler.pt" +FILENAME_SCHEDULER_TRAINED = lambda x: "scheduler_"+str(x)+".pt" + +FILENAME_OPTIMIZER_INIT = "optimizer_init.pt" +FILENAME_OPTIMIZER_LATEST = "optimizer.pt" +FILENAME_OPTIMIZER_TRAINED = lambda x: "optimizer_"+str(x)+".pt" + +# FILENAME_TRAINLOG_INIT = "train_init.pt" +FILENAME_TRAINSTATS_LATEST = "trainstats_latest.csv" +FILENAME_TRAINSTATS_TRAINED = lambda x: "trainstats_"+str(x)+".csv" + +FILENAME_TRAINLOG = "train" +FORMAT_TIMESTAMP_FILEHANDLER = "%Y%m%d%H%M%S_%f.log" +FORMAT_TIMESTAMP = "%Y/%m/%d %H:%M:%S %f" + +FORMAT_LOG = "" +DRY_RUN = "dry_run" +LOG_LEVEL = "log_level" +TOKENIZER = "tokenizer" +RUN_ONE_EPOCH = "run_one_epoch" +# # Column names +# IS_NOVEL = "IS_NOVAL" +# NOVALTY = "Novalty" +# # VALIDITY = "Validity" +# IS_VALID = "IS_VALID" +# IS_NOVAL = "IS_NOVAL" +# DIR_SAVE = "dir_save" +# MODEL_LATEST = "model.pt" +# LOG_TRAIN_LATEST = "train_log.csv" +# OPTIMIZER_LATEST = "optimizer.pt" +# SCHEDULER_LATEST = "scheduler.pt" +# TRAIN_LOSS = "train_loss" +# TEST_LOSS = "test_loss" +# TIME_ELAPSED = "time_elapsed" +# # LR = "lr" +# TOKENS = "tokens" + +LOGP = "logP" +WEIGHT = "weight" +QED = "QED" +VALIDITY = "SMILES_VALID" +FILENAME_TRAIN_DIST = "train_dist.pt" +FILENAME_TEST_DIST = "test_dist.pt" +MODEL_PRETRAIN = "model_pretrained.pt" + +PYFILE_SAMPLER = "sampler.py" +PYFILE_TRAINER = "trainer.py" +PYFILE_DATALOADER = "dataloader.py" +# PYFILE_SAMPLER = "sampler.py" + + + + +# Model parameters +NUM_LAYERS = "num_layers" +NUM_ENCODER_LAYERS = "num_encoder_layers" +NUM_DECODER_LAYERS = "num_decoder_layers" +NUM_HEADS = "num_heads" +DIM_ATTENTION = "dim_attention" +DIM_FEEDFORWARD = "dim_feedforward" +DIM_LSTM = "dim_lstm" +DIM_EMBEDDING = "dim_embedding" +DIM_OUTPUT = "dim_output" +RATE_DROPOUT = 
"rate_dropout" + + + + +#Scheduler +SIZE_STEP = "size_step" +GAMMA = "gamma" + + + + + + + + +# From Reinvent-Scaffold-Decorator +ATTACHMENT_POINT_TOKEN = "*" +ATTACHMENT_POINT_NUM_REGEXP = r"\[{}:(\d+)\]".format(re.escape(ATTACHMENT_POINT_TOKEN)) +ATTACHMENT_POINT_REGEXP = r"(?:{0}|\[{0}[^\]]*\])".format(re.escape(ATTACHMENT_POINT_TOKEN)) +ATTACHMENT_POINT_NO_BRACKETS_REGEXP = r"(? 1: + nn.init.xavier_uniform_(p) + # if isinstance(module, (nn.Linear, nn.Embedding)): + # module.weight.data.normal_(mean=0.0, std=0.02) + # if isinstance(module, nn.Linear) and module.bias is not None: + # module.bias.data.zero_() + # elif isinstance(module, nn.LayerNorm): + # module.bias.data.zero_() + # module.weight.data.fill_(1.0) + def init_optimizers(self,train_config): + optimizer = torch.optim.Adam(self.parameters(), lr=train_config[varables.RATE_LEARNING]) + return optimizer + def init_scheduler(self,train_config): + scheduler = torch.optim.lr_scheduler.StepLR(self.optimizer, step_size=train_config[varables.SIZE_STEP], gamma=train_config[varables.GAMMA]) + return scheduler + def get_collate_fn(self, vocab_encoder,vocab_decoder): + def collate(results): + X_Encoder = [a[0] for a in results] + X_Decoder = [a[1] for a in results] + boundary = -1 + max_len_x = max([len(a) for a in X_Encoder]) + max_len_y = max([len(a) for a in X_Decoder]) + x = torch.tensor([(a+[vocab_encoder[varables.TOKEN_PAD]]*(max_len_x-len(a))) for a in X_Encoder],dtype=torch.long) + y = torch.tensor([(a+[vocab_decoder[varables.TOKEN_PAD]]*(max_len_y-len(a))) for a in X_Decoder],dtype=torch.long) + return x,y,boundary + return collate + + def generate_masks(self, X_Decoder): + # Generate encoder, decoder, cross masks + T = X_Decoder.shape[1] + Mask_Decoder = (X_Decoder != self.Token_Padding_Decoder).unsqueeze(-2).unsqueeze(-2).repeat(1,1,T,1) + mask_tril = torch.tril(torch.ones(T, T)).view(1, 1, T, T).to(Mask_Decoder.device) + Mask_Decoder = Mask_Decoder.masked_fill(mask_tril==0,0) + return Mask_Decoder + + 
def forward(self, X_Encoder, X_Decoder, Y_Decoder_Ref=None,boundary=None): + Mask_Decoder = self.generate_masks(X_Decoder) + # preprocess + X_Decoder = self.Dropout1(self.Embedding_Decoder(X_Decoder) * math.sqrt(self.Dim_Attention) + self.pos_emb(X_Decoder.size(1))) + # Decoder blocks + for decoder_block in self.decoder_blocks: + X_Decoder = decoder_block(X_Decoder,Mask_Decoder) + X_Decoder = self.LayerNorm1(X_Decoder) + Y_Decoder_Logits = self.head(X_Decoder) + loss = None + if Y_Decoder_Ref is not None: + loss = F.cross_entropy(Y_Decoder_Logits.view(-1, Y_Decoder_Logits.size(-1)), Y_Decoder_Ref.view(-1),ignore_index=self.Token_Padding_Decoder) + return Y_Decoder_Logits, loss \ No newline at end of file diff --git a/SCMG/models/GPT/sampler.py b/SCMG/models/GPT/sampler.py new file mode 100644 index 0000000000000000000000000000000000000000..1c606302447534d425b50bbd15c153ea79895b65 --- /dev/null +++ b/SCMG/models/GPT/sampler.py @@ -0,0 +1,85 @@ +import random +import numpy as np +import torch +import torch.nn as nn +from torch.nn import functional as F + +def set_seed(seed): + random.seed(seed) + np.random.seed(seed) + torch.manual_seed(seed) + torch.cuda.manual_seed_all(seed) + +def top_k_logits(logits, k): + v, ix = torch.topk(logits, k) + out = logits.clone() + out[out < v[:, [-1]]] = -float('Inf') + return out + +@torch.no_grad() +def sample(model, x, steps, temperature=1.0, sample=False, top_k=None): + block_size = model.get_block_size() + model.eval() + for k in range(steps): + x_cond = x if x.size(1) <= block_size else x[:, -block_size:] + logits, _ = model(x_cond) + logits = logits[:, -1, :] / temperature + if top_k is not None: + logits = top_k_logits(logits, top_k) + probs = F.softmax(logits, dim=-1) + if sample: + ix = torch.multinomial(probs, num_samples=1) + else: + _, ix = torch.topk(probs, k=1, dim=-1) + x = torch.cat((x, ix), dim=1) + + return x + + + + +@torch.no_grad() +def sample(model, x, steps, temperature=1.0,boundary=None): + block_size = 
model.get_block_size() + model.eval() + for k in range(steps): + x_cond = x if x.size(1) <= block_size else x[:, -block_size:] + logits, _ = model(x_cond,boundary=boundary) + logits = logits[:, -1, :] / temperature + probs = F.softmax(logits, dim=-1) + ix = torch.multinomial(probs, num_samples=1) + x = torch.cat((x, ix), dim=1) + return x + +'L_5*C(=O)NCc1cccc(OC)c1.*c1nsc2ccccc12COc1cccc(CNC(=O)c2cccc(NC(=O)c3nsc4ccccc34)c2)c1' + +# for i in range(1,21): +def sample_L(i,option='string'): + # i=2 + prefix = 'L_'+str(i) + string_input = prefix + '*O=C1NN=Cc2c1cccc2.*O=C(C1CC1)N1CCNCC1' + array_input = [vocab[a] for a in [''] + list(string_input)] + boundary = [len(array_input)] + tensor_input = torch.tensor(array_input,device='cuda').unsqueeze(0).repeat(32,1) + boundary = boundary*32 + tensor_output = sample(model,tensor_input,250,boundary=boundary) + strings_output = [] + for j in range(tensor_output.shape[0]): + list_string_output = [inv[a] for a in tensor_output[j,boundary[j]:].cpu().numpy() if a != vocab['']] + # if list_string_output[0] == '': + # list_string_output = list_string_output[1:] + if list_string_output[-1] == '': + list_string_output = list_string_output[:-1] + string_output = ''.join(list_string_output) + strings_output.append(string_output) + print(string_output) + for j in range(tensor_output.shape[0]): + if test_valid(strings_output[j]): + print(1) + else: + print(0) + + # logits,_ = model(tensor_input,boundary=boundary) + + +['', 'L', '_', '5', '*', 'C', '(', '=', 'O', ')', 'N', 'C', 'c', '1', 'c', 'c', 'c', 'c', '(', 'O', 'C', ')', 'c', '1', '.', '*', 'c', '1', 'n', 's', 'c', '2', 'c', 'c', 'c', 'c', 'c', '1', '2', 'C', 'O', 'c', '1', 'c', 'c', 'c', 'c', '(', 'C', 'N', 'C', '(', '=', 'O', ')', 'c', '2', 'c', 'c', 'c', 'c', '(', 'N', 'C', '(', '=', 'O', ')', 'c', '3', 'n', 's', 'c', '4', 'c', 'c', 'c', 'c', 'c', '3', '4', ')', 'c', '2', ')', 'c', '1', ''] diff --git a/SCMG/models/GPT2/__init__.py b/SCMG/models/GPT2/__init__.py new file mode 
100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/SCMG/models/GPT2/__pycache__/__init__.cpython-310.pyc b/SCMG/models/GPT2/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a5e8f67f4fd9d403ef89903b3eaf33ff6f5a62ad Binary files /dev/null and b/SCMG/models/GPT2/__pycache__/__init__.cpython-310.pyc differ diff --git a/SCMG/models/GPT2/__pycache__/model.cpython-310.pyc b/SCMG/models/GPT2/__pycache__/model.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..3b2351be172e79b65b1d42bdd047478a6b150e81 Binary files /dev/null and b/SCMG/models/GPT2/__pycache__/model.cpython-310.pyc differ diff --git a/SCMG/models/GPT2/__pycache__/sampler.cpython-310.pyc b/SCMG/models/GPT2/__pycache__/sampler.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c44a897fb3271ec4e35a8882d260094352c94441 Binary files /dev/null and b/SCMG/models/GPT2/__pycache__/sampler.cpython-310.pyc differ diff --git a/SCMG/models/GPT2/model.py b/SCMG/models/GPT2/model.py new file mode 100644 index 0000000000000000000000000000000000000000..d5b1822fbd0fd2f6e0286723b95567d1630bc5f2 --- /dev/null +++ b/SCMG/models/GPT2/model.py @@ -0,0 +1,197 @@ +import math +import logging + +import torch +import torch.nn as nn +from torch.nn import functional as F + +# logger = logging.getLogger(__name__) +from SCMG.config import varables +from torch.autograd import Variable + +class PositionalEncoder(nn.Module): + def __init__(self, config): + super(PositionalEncoder, self).__init__() + self.Dropout = nn.Dropout(p=config[varables.RATE_DROPOUT]) + max_len = config[varables.SIZE_BLOCK] + pe = torch.zeros(max_len, config[varables.DIM_ATTENTION]) + position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1) + div_term = torch.exp(torch.arange(0, config[varables.DIM_ATTENTION], 2).float() * (-math.log(10000.0) / config[varables.DIM_ATTENTION])) + 
pe[:, 0::2] = torch.sin(position * div_term) + pe[:, 1::2] = torch.cos(position * div_term) + pe = pe.unsqueeze(0) + self.register_buffer('pe', pe) + def forward(self, T): + x = self.Dropout(self.pe[:,:T, :]) + return x + + +class Attention(nn.Module): + def __init__(self, config): + super().__init__() + assert config[varables.DIM_ATTENTION] % config[varables.NUM_HEADS] == 0 + self.Key = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION]) + self.Query = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION]) + self.Value = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION]) + self.Dropout_Attention = nn.Dropout(config[varables.RATE_DROPOUT]) + self.Dropout_Residue = nn.Dropout(config[varables.RATE_DROPOUT]) + self.Projection = nn.Linear(config[varables.DIM_ATTENTION], config[varables.DIM_EMBEDDING]) + self.NumberOfHeads = config[varables.NUM_HEADS] + self.DimHead = config[varables.DIM_ATTENTION] // self.NumberOfHeads + self.DimAttention = config[varables.DIM_ATTENTION] + + def forward(self, X_1,X_2, mask=None): + if X_2 is None: + X_2 = X_1 + BatchSize, T_Encoder, _ = X_1.size() + BatchSize, T_Decoder, _ = X_2.size() + K = self.Key( X_1).view(BatchSize, T_Encoder, self.NumberOfHeads,self.DimHead).transpose(1, 2) + Q = self.Query(X_2).view(BatchSize, T_Decoder, self.NumberOfHeads,self.DimHead).transpose(1, 2) + V = self.Value(X_1).view(BatchSize, T_Encoder, self.NumberOfHeads,self.DimHead).transpose(1, 2) + # k,q,v dimension: (BatchSize, SequenceSize, NumberOfHeads, HeadDimension) 3,4,5,16 + ScoreAttention = (Q @ K.transpose(-2, -1)) / math.sqrt(self.DimHead) + ScoreAttention = ScoreAttention.masked_fill(mask==0, -1e9) + ScoreAttention = F.softmax(ScoreAttention, dim=-1) + ScoreAttention = self.Dropout_Attention(ScoreAttention) + # k.transpose(-2,-1): 3,4,16,5 + # (q@(k.transpose(-2,-1))): 3,4,5,5 + Z = ScoreAttention @ V + # y dimension: 3,4,5,16 + Z = Z.transpose(1, 2).contiguous().view(BatchSize, 
T_Decoder, self.DimAttention) + # y dimension: 3,5,64 + Z = self.Dropout_Residue(self.Projection(Z)) + return Z + + + + + + + + + + +class FeedForward(nn.Module): + def __init__(self, config): + super().__init__() + if config[varables.DIM_FEEDFORWARD] == 0: + Dim_FeedForward = config[varables.DIM_ATTENTION] *4 + else: + Dim_FeedForward = config[varables.DIM_FEEDFORWARD] + self.Linear1 = nn.Linear(config[varables.DIM_EMBEDDING], Dim_FeedForward) + self.GELU = nn.GELU() + self.Linear2 = nn.Linear(Dim_FeedForward, config[varables.DIM_EMBEDDING]) + self.Dropout = nn.Dropout(config[varables.RATE_DROPOUT]) + + def forward(self,x): + x = self.Linear1(x) + x = self.GELU (x) + x = self.Dropout(x) + x = self.Linear2(x) + return x + +class DecoderBlock(nn.Module): + def __init__(self, config): + super().__init__() + self.LayerNorm1 = nn.LayerNorm(config[varables.DIM_EMBEDDING]) + self.LayerNorm2 = nn.LayerNorm(config[varables.DIM_EMBEDDING]) + self.Dropout1 = nn.Dropout(config[varables.RATE_DROPOUT]) + self.Dropout2 = nn.Dropout(config[varables.RATE_DROPOUT]) + self.AttentionMasked = Attention( config) + self.AttentionCross = Attention( config) + self.FeedForward = FeedForward(config) + + def forward(self, X_Decoder,Mask_Decoder): + X_Decoder = self.Dropout1(X_Decoder + self.AttentionMasked(self.LayerNorm1(X_Decoder), None, Mask_Decoder)) + X_Decoder = self.Dropout2(X_Decoder + self.FeedForward (self.LayerNorm2(X_Decoder) )) + return X_Decoder + + + + + + + + + + + + + + + + + + + +class Model(nn.Module): + def __init__(self, config): + super().__init__() + # Varables + self.Dim_Attention = config[varables.DIM_ATTENTION] + self.Token_Padding_Decoder = config["Token_Padding_Decoder"] + # Embedding and positional encoding layers + self.Embedding_Decoder = nn.Embedding(len(config["vocab_decoder"]), config[varables.DIM_ATTENTION]) + self.pos_emb = PositionalEncoder(config) + # Dropout and normalization layers + self.Dropout1 = nn.Dropout(config[varables.RATE_DROPOUT]) + 
self.LayerNorm1 = nn.LayerNorm(config[varables.DIM_EMBEDDING]) + # Transformer layers + self.decoder_blocks = nn.ModuleList([DecoderBlock(config) for _ in range(config[varables.NUM_DECODER_LAYERS])]) + # Output layer + self.head = nn.Linear(config[varables.DIM_ATTENTION], len(config["vocab_decoder"]), bias=False) + # Init + self.apply(self._init_weights) + self.optimizer = None + # logger.info("number of parameters: %e", sum(p.numel() for p in self.parameters())) + + def _init_weights(self, module): + for p in module.parameters(): + if p.dim() > 1: + nn.init.xavier_uniform_(p) + # if isinstance(module, (nn.Linear, nn.Embedding)): + # module.weight.data.normal_(mean=0.0, std=0.02) + # if isinstance(module, nn.Linear) and module.bias is not None: + # module.bias.data.zero_() + # elif isinstance(module, nn.LayerNorm): + # module.bias.data.zero_() + # module.weight.data.fill_(1.0) + def init_optimizers(self,train_config): + optimizer = torch.optim.Adam(self.parameters(), lr=train_config[varables.RATE_LEARNING]) + return optimizer + def init_scheduler(self,train_config): + scheduler = torch.optim.lr_scheduler.StepLR(self.optimizer, step_size=train_config[varables.SIZE_STEP], gamma=train_config[varables.GAMMA]) + return scheduler + def get_collate_fn(self, vocab_encoder,vocab_decoder): + def collate(results): + X_Encoder = [a[0] for a in results] + X_Decoder = [a[1] for a in results] + boundary = -1 + max_len_x = max([len(a) for a in X_Encoder]) + max_len_y = max([len(a) for a in X_Decoder]) + x = torch.tensor([(a+[vocab_encoder[varables.TOKEN_PAD]]*(max_len_x-len(a))) for a in X_Encoder],dtype=torch.long) + y = torch.tensor([(a+[vocab_decoder[varables.TOKEN_PAD]]*(max_len_y-len(a))) for a in X_Decoder],dtype=torch.long) + return x,y,boundary + return collate + + def generate_masks(self, X_Decoder): + # Generate encoder, decoder, cross masks + T = X_Decoder.shape[1] + Mask_Decoder = (X_Decoder != self.Token_Padding_Decoder).unsqueeze(-2).unsqueeze(-2).repeat(1,1,T,1) + 
mask_tril = torch.tril(torch.ones(T, T)).view(1, 1, T, T).to(Mask_Decoder.device) + Mask_Decoder = Mask_Decoder.masked_fill(mask_tril==0,0) + return Mask_Decoder + + def forward(self, X_Encoder, X_Decoder, Y_Decoder_Ref=None,boundary=None): + Mask_Decoder = self.generate_masks(X_Decoder) + # preprocess + X_Decoder = self.Dropout1(self.Embedding_Decoder(X_Decoder) * math.sqrt(self.Dim_Attention) + self.pos_emb(X_Decoder.size(1))) + # Decoder blocks + for decoder_block in self.decoder_blocks: + X_Decoder = decoder_block(X_Decoder,Mask_Decoder) + X_Decoder = self.LayerNorm1(X_Decoder) + Y_Decoder_Logits = self.head(X_Decoder) + loss = None + if Y_Decoder_Ref is not None: + loss = F.cross_entropy(Y_Decoder_Logits.view(-1, Y_Decoder_Logits.size(-1)), Y_Decoder_Ref.view(-1),ignore_index=self.Token_Padding_Decoder) + return Y_Decoder_Logits, loss \ No newline at end of file diff --git a/SCMG/models/GPT2/sampler.py b/SCMG/models/GPT2/sampler.py new file mode 100644 index 0000000000000000000000000000000000000000..1c606302447534d425b50bbd15c153ea79895b65 --- /dev/null +++ b/SCMG/models/GPT2/sampler.py @@ -0,0 +1,85 @@ +import random +import numpy as np +import torch +import torch.nn as nn +from torch.nn import functional as F + +def set_seed(seed): + random.seed(seed) + np.random.seed(seed) + torch.manual_seed(seed) + torch.cuda.manual_seed_all(seed) + +def top_k_logits(logits, k): + v, ix = torch.topk(logits, k) + out = logits.clone() + out[out < v[:, [-1]]] = -float('Inf') + return out + +@torch.no_grad() +def sample(model, x, steps, temperature=1.0, sample=False, top_k=None): + block_size = model.get_block_size() + model.eval() + for k in range(steps): + x_cond = x if x.size(1) <= block_size else x[:, -block_size:] + logits, _ = model(x_cond) + logits = logits[:, -1, :] / temperature + if top_k is not None: + logits = top_k_logits(logits, top_k) + probs = F.softmax(logits, dim=-1) + if sample: + ix = torch.multinomial(probs, num_samples=1) + else: + _, ix = 
torch.topk(probs, k=1, dim=-1) + x = torch.cat((x, ix), dim=1) + + return x + + + + +@torch.no_grad() +def sample(model, x, steps, temperature=1.0,boundary=None): + block_size = model.get_block_size() + model.eval() + for k in range(steps): + x_cond = x if x.size(1) <= block_size else x[:, -block_size:] + logits, _ = model(x_cond,boundary=boundary) + logits = logits[:, -1, :] / temperature + probs = F.softmax(logits, dim=-1) + ix = torch.multinomial(probs, num_samples=1) + x = torch.cat((x, ix), dim=1) + return x + +'L_5*C(=O)NCc1cccc(OC)c1.*c1nsc2ccccc12COc1cccc(CNC(=O)c2cccc(NC(=O)c3nsc4ccccc34)c2)c1' + +# for i in range(1,21): +def sample_L(i,option='string'): + # i=2 + prefix = 'L_'+str(i) + string_input = prefix + '*O=C1NN=Cc2c1cccc2.*O=C(C1CC1)N1CCNCC1' + array_input = [vocab[a] for a in [''] + list(string_input)] + boundary = [len(array_input)] + tensor_input = torch.tensor(array_input,device='cuda').unsqueeze(0).repeat(32,1) + boundary = boundary*32 + tensor_output = sample(model,tensor_input,250,boundary=boundary) + strings_output = [] + for j in range(tensor_output.shape[0]): + list_string_output = [inv[a] for a in tensor_output[j,boundary[j]:].cpu().numpy() if a != vocab['']] + # if list_string_output[0] == '': + # list_string_output = list_string_output[1:] + if list_string_output[-1] == '': + list_string_output = list_string_output[:-1] + string_output = ''.join(list_string_output) + strings_output.append(string_output) + print(string_output) + for j in range(tensor_output.shape[0]): + if test_valid(strings_output[j]): + print(1) + else: + print(0) + + # logits,_ = model(tensor_input,boundary=boundary) + + +['', 'L', '_', '5', '*', 'C', '(', '=', 'O', ')', 'N', 'C', 'c', '1', 'c', 'c', 'c', 'c', '(', 'O', 'C', ')', 'c', '1', '.', '*', 'c', '1', 'n', 's', 'c', '2', 'c', 'c', 'c', 'c', 'c', '1', '2', 'C', 'O', 'c', '1', 'c', 'c', 'c', 'c', '(', 'C', 'N', 'C', '(', '=', 'O', ')', 'c', '2', 'c', 'c', 'c', 'c', '(', 'N', 'C', '(', '=', 'O', ')', 'c', '3', 
'n', 's', 'c', '4', 'c', 'c', 'c', 'c', 'c', '3', '4', ')', 'c', '2', ')', 'c', '1', ''] diff --git a/SCMG/models/LSTM/__init__.py b/SCMG/models/LSTM/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/SCMG/models/LSTM/__pycache__/__init__.cpython-310.pyc b/SCMG/models/LSTM/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a80648fac63e5677b96d14154c9254b403facf09 Binary files /dev/null and b/SCMG/models/LSTM/__pycache__/__init__.cpython-310.pyc differ diff --git a/SCMG/models/LSTM/__pycache__/model.cpython-310.pyc b/SCMG/models/LSTM/__pycache__/model.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8b04314d86110100a1cf6f3b6305bfa3ac129ea4 Binary files /dev/null and b/SCMG/models/LSTM/__pycache__/model.cpython-310.pyc differ diff --git a/SCMG/models/LSTM/__pycache__/sampler.cpython-310.pyc b/SCMG/models/LSTM/__pycache__/sampler.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..3d989a0b8bf05a57af3c6b8aa83b8261fc7996fd Binary files /dev/null and b/SCMG/models/LSTM/__pycache__/sampler.cpython-310.pyc differ diff --git a/SCMG/models/LSTM/__pycache__/trainer.cpython-310.pyc b/SCMG/models/LSTM/__pycache__/trainer.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9ce864c1d5f0af498e375c5f5f29ebd674896f5e Binary files /dev/null and b/SCMG/models/LSTM/__pycache__/trainer.cpython-310.pyc differ diff --git a/SCMG/models/LSTM/model.py b/SCMG/models/LSTM/model.py new file mode 100644 index 0000000000000000000000000000000000000000..59f0e44f03505d9bb05380a3e0014ba13e3b9f22 --- /dev/null +++ b/SCMG/models/LSTM/model.py @@ -0,0 +1,48 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F +import torch.nn.utils.rnn as rnn_utils +from SCMG.config import varables + +class Model(nn.Module): + def __init__(self, config): + 
super().__init__() + self.vocab = config["vocab_encoder"] + # self.vocabulary = vocabulary + # self.hidden_size = config.hidden + # self.num_layers = config.num_layers + # self.dropout = config.dropout + # self.vocab_size = self.input_size = self.output_size = len(vocabulary) + self.embedding_layer = nn.Embedding(len(config["vocab_encoder"]), config[varables.DIM_EMBEDDING]) + self.lstm_layer = nn.LSTM(config[varables.DIM_EMBEDDING], config[varables.DIM_LSTM], + config[varables.NUM_LAYERS], dropout=config[varables.RATE_DROPOUT], + batch_first=True) + self.linear_layer = nn.Linear(config[varables.DIM_LSTM], len(config["vocab_encoder"])) + def get_collate_fn(self, vocab_encoder,vocab_decoder): + def collate(results): + x_in = None + y_in = [a[0] + [vocab_encoder[varables.TOKEN_SEP]] + a[1] for a in results] + # boundary = [a[2] for a in results] + max_len = max([len(a) for a in y_in]) + y = torch.tensor([(a+[vocab_encoder[varables.TOKEN_PAD]]*(max_len-len(a))) for a in y_in],dtype=torch.long) + return x_in,y,0 + return collate + def init_optimizers(self,train_config): + optimizer = torch.optim.Adam(self.parameters(), lr=train_config[varables.RATE_LEARNING]) + return optimizer + def init_scheduler(self,train_config): + scheduler = torch.optim.lr_scheduler.StepLR(self.optimizer, step_size=train_config[varables.SIZE_STEP], gamma=train_config[varables.GAMMA]) + return scheduler + def forward(self, src, trg, trg_out, boundary=None): + # x = ([src , torch.tensor([self.vocab[""]]*x.size[0]).unsqueeze(1).to(x.device), trg],dim=1) + hiddens=None + x = self.embedding_layer(trg) + # x = rnn_utils.pack_padded_sequence(x, lengths, batch_first=True) + self.lstm_layer.flatten_parameters() + x, hiddens = self.lstm_layer(x, hiddens) + # x, _ = rnn_utils.pad_packed_sequence(x, batch_first=True) + logits = self.linear_layer(x) + loss = None + if trg_out is not None: + loss = F.cross_entropy(logits.view(-1, logits.size(-1)), trg_out.view(-1)) + return logits, loss diff --git 
a/SCMG/models/LSTM/sampler.py b/SCMG/models/LSTM/sampler.py new file mode 100644 index 0000000000000000000000000000000000000000..40ab9635c922fd242699930d44a623161f732ac8 --- /dev/null +++ b/SCMG/models/LSTM/sampler.py @@ -0,0 +1,20 @@ +from MoleculeProcessing.utils.utils import * +from MoleculeProcessing.utils.utils_sample import * +import torch.nn.functional as F + +def sample(model,vocab_bos,size_batch=32,size_block=70,temperature=1.,): + model,device = load_to_device(model) + model.eval() + with torch.no_grad(): + tensor_sampled = torch.zeros(size_batch,size_block+1,dtype=torch.long,device=device) + tensor_sampled[:,0] = vocab_bos + hiddens = None + for i in range(size_block): + input_current = tensor_sampled[:,[i]] + probs,hiddens = model.forward(input_current,hiddens) + probs = probs[:,-1] + probs = probs * temperature + probs = F.softmax(probs,dim=-1) + sample = torch.distributions.categorical.Categorical(probs).sample() + tensor_sampled[:,i+1] = sample + return tensor_sampled diff --git a/SCMG/models/LSTM/trainer.py b/SCMG/models/LSTM/trainer.py new file mode 100644 index 0000000000000000000000000000000000000000..969b615c7d42c01d2e13ff3e7ab9e17e7b953daf --- /dev/null +++ b/SCMG/models/LSTM/trainer.py @@ -0,0 +1,195 @@ +import math +import logging +import time +from tqdm import tqdm +import numpy as np + +import torch +import torch.nn as nn +import torch.optim as optim +from torch.optim.lr_scheduler import LambdaLR +from torch.utils.data.dataloader import DataLoader +from MoleculeProcessing.utils.utils_train import * +logger = logging.getLogger(__name__) +from MoleculeProcessing.utils.utils import * +from MoleculeProcessing.utils.utils_train import * +from MoleculeProcessing.config.config import * + +class TrainerConfig: + learning_rate = 3e-4 + betas = (0.9, 0.95) + grad_norm_clip = 1.0 + weight_decay = 0.1 + lr_decay = False + warmup_tokens = 375e6 + final_tokens = 260e9 + ckpt_path = None + num_workers = 0 + config = None + epoch = 0 + + def __init__(self, 
class Trainer:
    """Runs the train/test loop for a model, with checkpointing and resume.

    On construction, if a previous training log exists in the checkpoint
    directory, the model/optimizer/scheduler saved at the last completed
    epoch are reloaded and training resumes from the recorded epoch.
    """

    def __init__(self, model, train_dataset, test_dataset, config):
        self.model = model
        self.train_dataset = train_dataset
        self.test_dataset = test_dataset
        self.config = config
        # continue training if a previous run left a log in the checkpoint dir
        self.train_log = init_train_log()
        path_log = os.path.join(self.config.config.path_checkpoint, LOG_TRAIN_LATEST)
        if os.path.exists(path_log):
            self.train_log = pd.read_csv(path_log)
            self.config.epoch = self.train_log.shape[0]
        if self.train_log.shape[0] > 0:
            # resume: reload every artefact saved at the last completed epoch
            self.model = load_model(self.config.config.path_checkpoint, self.config.epoch - 1)
            self.optimizer = load_optimizer(self.config.config.path_checkpoint, self.config.epoch - 1)
            self.tokens = self.train_log.loc[self.config.epoch - 1, TOKENS]
            self.scheduler = load_scheduler(self.config.config.path_checkpoint, self.config.epoch - 1)
        else:
            self.tokens = 0  # counter used for learning rate decay
            self.optimizer = model.configure_optimizers(config)
            self.scheduler = optim.lr_scheduler.StepLR(self.optimizer, 10, 0.5)
        self.criterion = nn.CrossEntropyLoss()
        # take over whatever gpus are on the system
        self.device = 'cpu'
        if torch.cuda.is_available():
            self.device = torch.cuda.current_device()
            self.model = torch.nn.DataParallel(self.model).to(self.device)

    def save_checkpoint(self):
        """Persist model, optimizer, scheduler, and train log for this epoch.

        Each artefact is written under an epoch-suffixed name (history); the
        train log is additionally written under the "latest" name, which is
        what ``__init__`` looks for when resuming.
        """
        path_checkpoint = self.config.config.path_checkpoint
        # DataParallel wrappers keep the raw model object in .module attribute
        raw_model = self.model.module if hasattr(self.model, "module") else self.model
        logger.info("saving %s", path_checkpoint)
        path_model_epoch = add_before_extension(
            os.path.join(path_checkpoint, MODEL_LATEST), str(self.config.epoch))
        torch.save(raw_model, path_model_epoch)
        # optimizer
        path_optimizer_epoch = add_before_extension(
            os.path.join(path_checkpoint, OPTIMIZER_LATEST), str(self.config.epoch))
        torch.save(self.optimizer, path_optimizer_epoch)
        # scheduler
        path_scheduler_epoch = add_before_extension(
            os.path.join(path_checkpoint, SCHEDULER_LATEST), str(self.config.epoch))
        torch.save(self.scheduler, path_scheduler_epoch)
        # train log: "latest" copy used for resuming, plus an epoch snapshot
        self.train_log.to_csv(
            os.path.join(path_checkpoint, LOG_TRAIN_LATEST), index=False)
        path_train_log_epoch = add_before_extension(
            os.path.join(path_checkpoint, LOG_TRAIN_LATEST), str(self.config.epoch))
        self.train_log.to_csv(path_train_log_epoch, index=False)

    def train(self):
        """Run epochs until config.config.epochs, checkpointing after each."""
        config = self.config
        # NOTE: was `... and self.config.epoch != config.config.epochs`,
        # which is implied by the `<` comparison; simplified.
        while self.config.epoch < config.config.epochs:
            current_status = dict([[a, None] for a in self.train_log.columns])
            current_status[EPOCH] = self.config.epoch
            time_start = time.time()
            current_status = self.run_epoch('train', current_status)
            current_status[TIME_ELAPSED] = int(time.time() - time_start)
            current_status[TOKENS] = self.tokens
            if self.test_dataset is not None:
                current_status = self.run_epoch('test', current_status)
            self.train_log.loc[self.config.epoch] = current_status
            self.scheduler.step()
            self.save_checkpoint()
            self.config.epoch += 1

    def run_epoch(self, split, current_status):
        """Run one pass over the train or test dataset.

        split: 'train' (gradients, lr schedule, progress bar) or 'test'
        (evaluation only). Returns `current_status` updated in place with
        the mean loss (and, for 'train', the last learning rate).
        """
        model = self.model
        is_train = split == 'train'
        model.train(is_train)
        data = self.train_dataset if is_train else self.test_dataset
        data.shuffle(random_state=self.config.epoch)
        loader = DataLoader(data, shuffle=False, pin_memory=True,
                            batch_size=self.config.config.size_batch,
                            num_workers=self.config.num_workers)

        losses = []
        pbar = tqdm(enumerate(loader), total=len(loader)) if is_train else enumerate(loader)
        for it, (x, y) in pbar:

            # place data on the correct device
            x = x.to(self.device)
            y = y.to(self.device)

            # forward the model
            with torch.set_grad_enabled(is_train):
                outputs, _ = model.forward(x)
                loss = self.criterion(outputs.view(-1, outputs.shape[-1]),
                                      y.view(-1))
                loss = loss.mean()  # collapse all losses if they are scattered on multiple gpus
                losses.append(loss.item())

            if is_train:
                # backprop and update the parameters
                model.zero_grad()
                loss.backward()
                torch.nn.utils.clip_grad_norm_(model.parameters(), self.config.grad_norm_clip)
                self.optimizer.step()

                # decay the learning rate based on our progress
                if self.config.lr_decay:
                    self.tokens += (y >= 0).sum()  # number of tokens processed this step (i.e. label is not -100)
                    if self.tokens < self.config.warmup_tokens:
                        # linear warmup
                        lr_mult = float(self.tokens) / float(max(1, self.config.warmup_tokens))
                    else:
                        # cosine learning rate decay
                        progress = float(self.tokens - self.config.warmup_tokens) / float(
                            max(1, self.config.final_tokens - self.config.warmup_tokens))
                        lr_mult = max(0.1, 0.5 * (1.0 + math.cos(math.pi * progress)))
                    lr = self.config.learning_rate * lr_mult
                    # BUG FIX: was `optimizer.param_groups` — `optimizer` is
                    # not defined in this method; the optimizer lives on self.
                    for param_group in self.optimizer.param_groups:
                        param_group['lr'] = lr
                else:
                    lr = self.config.learning_rate
                current_status[LR] = lr

                # report progress (train split only: the test-split iterator is
                # a plain enumerate with no set_description, and `lr` is only
                # defined on the train path)
                pbar.set_description(f"epoch {self.config.epoch+1} iter {it}: train loss {loss.item():.5f}. lr {lr:e}")
        mean_loss = float(np.mean(losses))
        current_status[split + '_loss'] = mean_loss
        if not is_train:
            logger.info("test loss: %f", mean_loss)
        return current_status
import math
import logging

import torch
import torch.nn as nn
from torch.nn import functional as F

logger = logging.getLogger(__name__)
from SCMG.config import varables

# (A commented-out legacy ModelConfig class previously lived here; all
# hyperparameters are now supplied through the `config` mapping keyed by
# SCMG.config.varables constants.)


class CausalSelfAttention(nn.Module):
    """Multi-head self-attention with a causal (lower-triangular) mask.

    Inputs of width DIM_EMBEDDING are projected to DIM_ATTENTION for the
    key/query/value spaces, split across NUM_HEADS heads, attended, and
    projected back to DIM_EMBEDDING. `layer_past` is accepted but unused.
    """

    def __init__(self, config):
        super().__init__()
        # attention width must divide evenly across the heads
        assert config[varables.DIM_ATTENTION] % config[varables.NUM_HEADS] == 0
        self.key = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION])
        self.query = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION])
        self.value = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION])
        self.dropout_attention = nn.Dropout(config[varables.RATE_DROPOUT])
        self.dropout_residue = nn.Dropout(config[varables.RATE_DROPOUT])
        self.projection = nn.Linear(config[varables.DIM_ATTENTION], config[varables.DIM_EMBEDDING])
        # fixed lower-triangular mask buffer, shape (1, 1, SIZE_BLOCK, SIZE_BLOCK)
        self.register_buffer("mask", torch.tril(torch.ones(config[varables.SIZE_BLOCK], config[varables.SIZE_BLOCK]))
                             .view(1, 1, config[varables.SIZE_BLOCK], config[varables.SIZE_BLOCK]))
        self.n_head = config[varables.NUM_HEADS]
        self.single_head_dim = config[varables.DIM_ATTENTION] // self.n_head
        self.attention_features = config[varables.DIM_ATTENTION]

    def forward(self, x, layer_past=None):
        # x: (B, T, DIM_EMBEDDING); requires T <= SIZE_BLOCK (mask size)
        B, T, C = x.size()
        k = self.key(x).view(B, T, self.n_head, self.single_head_dim).transpose(1, 2)
        q = self.query(x).view(B, T, self.n_head, self.single_head_dim).transpose(1, 2)
        v = self.value(x).view(B, T, self.n_head, self.single_head_dim).transpose(1, 2)
        # scaled dot-product attention scores: (B, n_head, T, T)
        att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1)))
        # positions above the diagonal are blocked so a token only attends backwards
        att = att.masked_fill(self.mask[:, :, :T, :T] == 0, float('-inf'))
        att = F.softmax(att, dim=-1)
        att = self.dropout_attention(att)
        y = att @ v
        # re-merge the heads: (B, n_head, T, head_dim) -> (B, T, DIM_ATTENTION)
        y = y.transpose(1, 2).contiguous().view(B, T, self.attention_features)
        y = self.dropout_residue(self.projection(y))
        return y


class CrossAttention(nn.Module):
    """Multi-head attention where queries come from the decoder stream and
    keys/values from the encoder stream.

    NOTE(review): the lower-triangular mask is applied to the cross-attention
    scores (decoder position i may only attend to encoder positions <= i);
    cross-attention is normally unmasked — confirm this is intended.
    """

    def __init__(self, config):
        super().__init__()
        assert config[varables.DIM_ATTENTION] % config[varables.NUM_HEADS] == 0
        self.key = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION])
        self.query = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION])
        self.value = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION])
        self.dropout_attention = nn.Dropout(config[varables.RATE_DROPOUT])
        self.dropout_residue = nn.Dropout(config[varables.RATE_DROPOUT])
        self.projection = nn.Linear(config[varables.DIM_ATTENTION], config[varables.DIM_EMBEDDING])
        self.n_head = config[varables.NUM_HEADS]
        self.single_head_dim = config[varables.DIM_ATTENTION] // self.n_head
        self.attention_features = config[varables.DIM_ATTENTION]
        self.register_buffer("mask", torch.tril(torch.ones(config[varables.SIZE_BLOCK], config[varables.SIZE_BLOCK]))
                             .view(1, 1, config[varables.SIZE_BLOCK], config[varables.SIZE_BLOCK]))

    def forward(self, x_encoder, x_decoder, layer_past=None):
        B_encoder, T_encoder, C_encoder = x_encoder.size()
        B_decoder, T_decoder, C_decoder = x_decoder.size()
        # NOTE(review): decoder projections are viewed with B_encoder —
        # assumes encoder and decoder share the same batch size.
        k = self.key(x_encoder).view(B_encoder, T_encoder, self.n_head, self.single_head_dim).transpose(1, 2)
        q = self.query(x_decoder).view(B_encoder, T_decoder, self.n_head, self.single_head_dim).transpose(1, 2)
        v = self.value(x_encoder).view(B_encoder, T_encoder, self.n_head, self.single_head_dim).transpose(1, 2)
        # scores: (B, n_head, T_decoder, T_encoder)
        att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1)))
        att = att.masked_fill(self.mask[:, :, :T_decoder, :T_encoder] == 0, float('-inf'))
        att = F.softmax(att, dim=-1)
        att = self.dropout_attention(att)
        y = att @ v
        y = y.transpose(1, 2).contiguous().view(B_encoder, T_decoder, self.attention_features)
        y = self.dropout_residue(self.projection(y))
        return y


class EncoderBlock(nn.Module):
    """Pre-norm transformer encoder block: self-attention + MLP, with
    residual connections around each sub-layer."""

    def __init__(self, config):
        super().__init__()
        self.ln1 = nn.LayerNorm(config[varables.DIM_EMBEDDING])
        self.ln2 = nn.LayerNorm(config[varables.DIM_EMBEDDING])
        self.attn = CausalSelfAttention(config)
        self.mlp = nn.Sequential(
            nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_FEEDFORWARD]),
            nn.GELU(),
            nn.Linear(config[varables.DIM_FEEDFORWARD], config[varables.DIM_EMBEDDING]),
            nn.Dropout(config[varables.RATE_DROPOUT]),
        )

    def forward(self, x):
        x = x + self.attn(self.ln1(x))
        x = x + self.mlp(self.ln2(x))
        return x


class DecoderBlock(nn.Module):
    """Pre-norm transformer decoder block: masked self-attention, then
    cross-attention onto the encoder output, then MLP."""

    def __init__(self, config):
        super().__init__()
        self.ln1 = nn.LayerNorm(config[varables.DIM_EMBEDDING])
        self.ln2 = nn.LayerNorm(config[varables.DIM_EMBEDDING])
        self.masked_attn = CausalSelfAttention(config)
        self.cross_attn = CrossAttention(config)
        self.mlp = nn.Sequential(
            nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_FEEDFORWARD]),
            nn.GELU(),
            nn.Linear(config[varables.DIM_FEEDFORWARD], config[varables.DIM_EMBEDDING]),
            nn.Dropout(config[varables.RATE_DROPOUT]),
        )

    def forward(self, x_encoder, x):
        x = x + self.masked_attn(self.ln1(x))
        # NOTE(review): ln1 is reused for the cross-attention sub-layer;
        # a dedicated LayerNorm per sub-layer is more common — confirm intended.
        x = x + self.cross_attn(x_encoder, self.ln1(x))
        x = x + self.mlp(self.ln2(x))
        return x
class Norm(nn.Module):
    """Layer normalisation with learnable gain (`alpha`) and shift (`bias`).

    Note: uses the sample standard deviation (``Tensor.std``, unbiased),
    which differs slightly from ``nn.LayerNorm``'s population variance.
    """

    def __init__(self, d_model, eps = 1e-6):
        super().__init__()

        self.size = d_model

        # learnable per-feature scale and offset
        self.alpha = nn.Parameter(torch.ones(self.size))
        self.bias = nn.Parameter(torch.zeros(self.size))

        self.eps = eps

    def forward(self, x):
        centered = x - x.mean(dim=-1, keepdim=True)
        spread = x.std(dim=-1, keepdim=True) + self.eps
        return self.alpha * centered / spread + self.bias


def attention(q, k, v, d_k, mask=None, dropout=None):
    """Scaled dot-product attention; q/k/v have shape (bs, heads, sl, d_k)."""
    weights = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(d_k)

    if mask is not None:
        # broadcast the mask across the head dimension; 0 entries are blocked
        weights = weights.masked_fill(mask.unsqueeze(1) == 0, -1e9)

    weights = F.softmax(weights, dim=-1)

    if dropout is not None:
        weights = dropout(weights)

    return torch.matmul(weights, v)


class MultiHeadAttention(nn.Module):
    """Standard multi-head attention: project, split into heads, attend,
    re-merge, and project out."""

    def __init__(self, heads, d_model, dropout = 0.1):
        super().__init__()

        self.d_model = d_model
        self.d_k = d_model // heads
        self.h = heads

        self.q_linear = nn.Linear(d_model, d_model)
        self.v_linear = nn.Linear(d_model, d_model)
        self.k_linear = nn.Linear(d_model, d_model)

        self.dropout = nn.Dropout(dropout)
        self.out = nn.Linear(d_model, d_model)

    def forward(self, q, k, v, mask=None):
        bs = q.size(0)

        # project, split into heads, then bring the head axis forward:
        # (bs, sl, d_model) -> (bs, h, sl, d_k)
        k = self.k_linear(k).view(bs, -1, self.h, self.d_k).transpose(1, 2)
        q = self.q_linear(q).view(bs, -1, self.h, self.d_k).transpose(1, 2)
        v = self.v_linear(v).view(bs, -1, self.h, self.d_k).transpose(1, 2)

        attended = attention(q, k, v, self.d_k, mask, self.dropout)

        # merge the heads back: (bs, h, sl, d_k) -> (bs, sl, d_model)
        merged = attended.transpose(1, 2).contiguous().view(bs, -1, self.d_model)
        return self.out(merged)


class FeedForward(nn.Module):
    """Position-wise feed-forward block: linear -> ReLU -> dropout -> linear."""

    def __init__(self, d_model, d_ff=2048, dropout = 0.1):
        super().__init__()

        # d_ff defaults to 2048 as in the original transformer
        self.linear_1 = nn.Linear(d_model, d_ff)
        self.dropout = nn.Dropout(dropout)
        self.linear_2 = nn.Linear(d_ff, d_model)

    def forward(self, x):
        hidden = self.dropout(F.relu(self.linear_1(x)))
        return self.linear_2(hidden)


import torch
import torch.nn as nn
import copy


class EncoderLayer(nn.Module):
    """Pre-norm encoder layer: self-attention then feed-forward, with a
    residual connection and dropout around each sub-layer."""

    def __init__(self, d_model, heads, dropout=0.1):
        super().__init__()
        self.norm_1 = Norm(d_model)
        self.norm_2 = Norm(d_model)
        self.attn = MultiHeadAttention(heads, d_model, dropout=dropout)
        self.ff = FeedForward(d_model, dropout=dropout)
        self.dropout_1 = nn.Dropout(dropout)
        self.dropout_2 = nn.Dropout(dropout)

    def forward(self, x, mask):
        normed = self.norm_1(x)
        x = x + self.dropout_1(self.attn(normed, normed, normed, mask))
        normed = self.norm_2(x)
        return x + self.dropout_2(self.ff(normed))


class DecoderLayer(nn.Module):
    """Pre-norm decoder layer: masked self-attention, cross-attention over
    the encoder outputs, then feed-forward; residual + dropout around each."""

    def __init__(self, d_model, heads, dropout=0.1):
        super().__init__()
        self.norm_1 = Norm(d_model)
        self.norm_2 = Norm(d_model)
        self.norm_3 = Norm(d_model)

        self.dropout_1 = nn.Dropout(dropout)
        self.dropout_2 = nn.Dropout(dropout)
        self.dropout_3 = nn.Dropout(dropout)

        self.attn_1 = MultiHeadAttention(heads, d_model, dropout=dropout)
        self.attn_2 = MultiHeadAttention(heads, d_model, dropout=dropout)
        self.ff = FeedForward(d_model, dropout=dropout)

    def forward(self, x, e_outputs, src_mask, trg_mask):
        normed = self.norm_1(x)
        x = x + self.dropout_1(self.attn_1(normed, normed, normed, trg_mask))
        normed = self.norm_2(x)
        x = x + self.dropout_2(self.attn_2(normed, e_outputs, e_outputs, src_mask))
        normed = self.norm_3(x)
        return x + self.dropout_3(self.ff(normed))


import torch
import torch.nn as nn
import math
from torch.autograd import Variable


class Embedder(nn.Module):
    """Thin wrapper around nn.Embedding mapping token ids to d_model vectors."""

    def __init__(self, vocab_size, d_model):
        super().__init__()
        self.d_model = d_model
        self.embed = nn.Embedding(vocab_size, d_model)

    def forward(self, x):
        return self.embed(x)
class PositionalEncoder(nn.Module):
    """Adds fixed sinusoidal positional encodings to an embedding sequence.

    The table is precomputed for `max_seq_len` positions and registered as a
    (1, max_seq_len, d_model) buffer.
    """

    def __init__(self, d_model, max_seq_len = 200, dropout = 0.1):
        super().__init__()
        self.d_model = d_model
        self.dropout = nn.Dropout(dropout)
        # create constant 'pe' matrix with values dependent on pos and i
        pe = torch.zeros(max_seq_len, d_model)
        for pos in range(max_seq_len):
            for i in range(0, d_model, 2):
                pe[pos, i] = \
                    math.sin(pos / (10000 ** ((2 * i) / d_model)))
                # NOTE(review): the canonical transformer encoding uses the
                # same frequency for each sin/cos pair (exponent 2*i/d_model);
                # this keeps the original 2*(i+1) variant — confirm before
                # changing, since trained checkpoints depend on it.
                pe[pos, i + 1] = \
                    math.cos(pos / (10000 ** ((2 * (i + 1)) / d_model)))
        pe = pe.unsqueeze(0)
        self.register_buffer('pe', pe)

    def forward(self, x):
        # scale embeddings so they are not swamped by the positional signal
        x = x * math.sqrt(self.d_model)
        seq_len = x.size(1)
        pe = Variable(self.pe[:, :seq_len], requires_grad=False)
        if x.is_cuda:
            # BUG FIX: Tensor.cuda() returns a copy, it is not in-place; the
            # original called `pe.cuda()` without rebinding, so a CPU `pe`
            # was added to a CUDA `x` (device mismatch).
            pe = pe.cuda()
        x = x + pe
        return self.dropout(x)


def get_clones(module, N):
    """Return a ModuleList of N independent deep copies of `module`."""
    return nn.ModuleList([copy.deepcopy(module) for i in range(N)])


class Encoder(nn.Module):
    """Embedding + positional encoding, N EncoderLayers, final Norm."""

    def __init__(self, vocab_size, d_model, N, heads, dropout):
        super().__init__()
        self.N = N
        self.embed = Embedder(vocab_size, d_model)
        self.pe = PositionalEncoder(d_model, dropout=dropout)
        self.layers = get_clones(EncoderLayer(d_model, heads, dropout), N)
        self.norm = Norm(d_model)

    def forward(self, src, mask):
        x = self.embed(src)
        x = self.pe(x)
        for i in range(self.N):
            x = self.layers[i](x, mask)
        return self.norm(x)


class Decoder(nn.Module):
    """Embedding + positional encoding, N DecoderLayers, final Norm."""

    def __init__(self, vocab_size, d_model, N, heads, dropout):
        super().__init__()
        self.N = N
        self.embed = Embedder(vocab_size, d_model)
        self.pe = PositionalEncoder(d_model, dropout=dropout)
        self.layers = get_clones(DecoderLayer(d_model, heads, dropout), N)
        self.norm = Norm(d_model)

    def forward(self, trg, e_outputs, src_mask, trg_mask):
        x = self.embed(trg)
        x = self.pe(x)
        for i in range(self.N):
            x = self.layers[i](x, e_outputs, src_mask, trg_mask)
        return self.norm(x)


class Model(nn.Module):
    """Seq2seq transformer with separate encoder and decoder vocabularies.

    Expects `config` to carry "vocab_encoder"/"vocab_decoder" mappings plus
    hyperparameters keyed by SCMG.config.varables constants.
    """

    def __init__(self, config):
        super().__init__()
        self.encoder = Encoder(len(config["vocab_encoder"]), config[varables.DIM_ATTENTION], config[varables.NUM_LAYERS], config[varables.NUM_HEADS], config[varables.RATE_DROPOUT])
        self.decoder = Decoder(len(config["vocab_decoder"]), config[varables.DIM_ATTENTION], config[varables.NUM_LAYERS], config[varables.NUM_HEADS], config[varables.RATE_DROPOUT])
        self.out = nn.Linear(config[varables.DIM_ATTENTION], len(config["vocab_decoder"]))
        self.optimizer = None

    def get_block_size(self):
        # NOTE(review): self.block_size is never assigned in this class (the
        # assignment is commented out in the original), so calling this
        # raises AttributeError — likely a leftover from a GPT-style model.
        return self.block_size

    def _init_weights(self, module):
        """minGPT-style init: N(0, 0.02) for linears/embeddings, unit LayerNorm."""
        if isinstance(module, (nn.Linear, nn.Embedding)):
            module.weight.data.normal_(mean=0.0, std=0.02)
            if isinstance(module, nn.Linear) and module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

    def init_optimizers(self, train_config):
        """Create an Adam optimizer over all parameters."""
        optimizer = torch.optim.Adam(self.parameters(), lr=train_config[varables.RATE_LEARNING])
        return optimizer

    def init_scheduler(self, train_config):
        """Create a StepLR schedule for self.optimizer (set it first)."""
        scheduler = torch.optim.lr_scheduler.StepLR(self.optimizer, step_size=train_config[varables.SIZE_STEP], gamma=train_config[varables.GAMMA])
        return scheduler

    def get_collate_fn(self, vocab_encoder, vocab_decoder):
        """Return a collate fn right-padding encoder/decoder id lists to the batch max."""
        def collate(results):
            x_in = [a[0] for a in results]
            y_in = [a[1] for a in results]
            boundary = -1
            max_len_x = max([len(a) for a in x_in])
            max_len_y = max([len(a) for a in y_in])
            x = torch.tensor([(a + [vocab_encoder[varables.TOKEN_PAD]] * (max_len_x - len(a))) for a in x_in], dtype=torch.long)
            y = torch.tensor([(a + [vocab_decoder[varables.TOKEN_PAD]] * (max_len_y - len(a))) for a in y_in], dtype=torch.long)
            return x, y, boundary
        return collate

    def forward(self, src, trg, trg_out, boundary=None):
        src_mask = None
        # Causal (no-peek) mask over target positions, shape (1, T, T);
        # attention() unsqueezes dim 1 to give (1, 1, T, T).
        # BUG FIX: was .view(1, 1, T, T), which became 5-D after the
        # unsqueeze inside attention() and broke masked_fill broadcasting.
        trg_mask = torch.tril(torch.ones(trg.shape[1], trg.shape[1])).view(1, trg.shape[1], trg.shape[1]).to(trg.device)
        e_outputs = self.encoder(src, src_mask)
        d_output = self.decoder(trg, e_outputs, src_mask, trg_mask)
        logits = self.out(d_output)
        loss = None
        if trg_out is not None:
            loss = F.cross_entropy(logits.view(-1, logits.size(-1)), trg_out.view(-1))
        return logits, loss
class CausalSelfAttention(nn.Module):
    """Multi-head self-attention with a causal (lower-triangular) mask.

    Inputs of width DIM_EMBEDDING are projected to DIM_ATTENTION, split
    across NUM_HEADS heads, attended, and projected back to DIM_EMBEDDING.
    `layer_past` is accepted but unused.
    """

    def __init__(self, config):
        super().__init__()
        # attention width must divide evenly across the heads
        assert config[varables.DIM_ATTENTION] % config[varables.NUM_HEADS] == 0
        self.key = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION])
        self.query = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION])
        self.value = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION])
        self.dropout_attention = nn.Dropout(config[varables.RATE_DROPOUT])
        self.dropout_residue = nn.Dropout(config[varables.RATE_DROPOUT])
        self.projection = nn.Linear(config[varables.DIM_ATTENTION], config[varables.DIM_EMBEDDING])
        # fixed lower-triangular mask buffer, shape (1, 1, SIZE_BLOCK, SIZE_BLOCK)
        self.register_buffer("mask", torch.tril(torch.ones(config[varables.SIZE_BLOCK], config[varables.SIZE_BLOCK]))
                             .view(1, 1, config[varables.SIZE_BLOCK], config[varables.SIZE_BLOCK]))
        self.n_head = config[varables.NUM_HEADS]
        self.single_head_dim = config[varables.DIM_ATTENTION] // self.n_head
        self.attention_features = config[varables.DIM_ATTENTION]

    def forward(self, x, layer_past=None):
        # x: (B, T, DIM_EMBEDDING); requires T <= SIZE_BLOCK
        B, T, C = x.size()
        k = self.key(x).view(B, T, self.n_head, self.single_head_dim).transpose(1, 2)
        q = self.query(x).view(B, T, self.n_head, self.single_head_dim).transpose(1, 2)
        v = self.value(x).view(B, T, self.n_head, self.single_head_dim).transpose(1, 2)
        # scaled dot-product scores, masked so tokens only attend backwards
        att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1)))
        att = att.masked_fill(self.mask[:, :, :T, :T] == 0, float('-inf'))
        att = F.softmax(att, dim=-1)
        att = self.dropout_attention(att)
        y = att @ v
        y = y.transpose(1, 2).contiguous().view(B, T, self.attention_features)
        y = self.dropout_residue(self.projection(y))
        return y


class CrossAttention(nn.Module):
    """Multi-head attention: queries from the decoder stream, keys/values
    from the encoder stream.

    NOTE(review): the lower-triangular mask is applied to cross-attention
    scores; cross-attention is normally unmasked — confirm intended.
    """

    def __init__(self, config):
        super().__init__()
        assert config[varables.DIM_ATTENTION] % config[varables.NUM_HEADS] == 0
        self.key = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION])
        self.query = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION])
        self.value = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION])
        self.dropout_attention = nn.Dropout(config[varables.RATE_DROPOUT])
        self.dropout_residue = nn.Dropout(config[varables.RATE_DROPOUT])
        self.projection = nn.Linear(config[varables.DIM_ATTENTION], config[varables.DIM_EMBEDDING])
        self.n_head = config[varables.NUM_HEADS]
        self.single_head_dim = config[varables.DIM_ATTENTION] // self.n_head
        self.attention_features = config[varables.DIM_ATTENTION]
        self.register_buffer("mask", torch.tril(torch.ones(config[varables.SIZE_BLOCK], config[varables.SIZE_BLOCK]))
                             .view(1, 1, config[varables.SIZE_BLOCK], config[varables.SIZE_BLOCK]))

    def forward(self, x_encoder, x_decoder, layer_past=None):
        B_encoder, T_encoder, C_encoder = x_encoder.size()
        B_decoder, T_decoder, C_decoder = x_decoder.size()
        # NOTE(review): decoder tensors are viewed with B_encoder — assumes
        # encoder and decoder batches have the same size.
        k = self.key(x_encoder).view(B_encoder, T_encoder, self.n_head, self.single_head_dim).transpose(1, 2)
        q = self.query(x_decoder).view(B_encoder, T_decoder, self.n_head, self.single_head_dim).transpose(1, 2)
        v = self.value(x_encoder).view(B_encoder, T_encoder, self.n_head, self.single_head_dim).transpose(1, 2)
        att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1)))
        att = att.masked_fill(self.mask[:, :, :T_decoder, :T_encoder] == 0, float('-inf'))
        att = F.softmax(att, dim=-1)
        att = self.dropout_attention(att)
        y = att @ v
        y = y.transpose(1, 2).contiguous().view(B_encoder, T_decoder, self.attention_features)
        y = self.dropout_residue(self.projection(y))
        return y


class EncoderBlock(nn.Module):
    """Pre-norm transformer encoder block: self-attention + MLP, residual
    connections around each sub-layer."""

    def __init__(self, config):
        super().__init__()
        self.ln1 = nn.LayerNorm(config[varables.DIM_EMBEDDING])
        self.ln2 = nn.LayerNorm(config[varables.DIM_EMBEDDING])
        self.attn = CausalSelfAttention(config)
        self.mlp = nn.Sequential(
            nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_FEEDFORWARD]),
            nn.GELU(),
            nn.Linear(config[varables.DIM_FEEDFORWARD], config[varables.DIM_EMBEDDING]),
            nn.Dropout(config[varables.RATE_DROPOUT]),
        )

    def forward(self, x):
        x = x + self.attn(self.ln1(x))
        x = x + self.mlp(self.ln2(x))
        return x


class DecoderBlock(nn.Module):
    """Pre-norm transformer decoder block: masked self-attention, then
    cross-attention onto the encoder output, then MLP."""

    def __init__(self, config):
        super().__init__()
        self.ln1 = nn.LayerNorm(config[varables.DIM_EMBEDDING])
        self.ln2 = nn.LayerNorm(config[varables.DIM_EMBEDDING])
        self.masked_attn = CausalSelfAttention(config)
        self.cross_attn = CrossAttention(config)
        self.mlp = nn.Sequential(
            nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_FEEDFORWARD]),
            nn.GELU(),
            nn.Linear(config[varables.DIM_FEEDFORWARD], config[varables.DIM_EMBEDDING]),
            nn.Dropout(config[varables.RATE_DROPOUT]),
        )

    def forward(self, x_encoder, x):
        x = x + self.masked_attn(self.ln1(x))
        # NOTE(review): ln1 reused for the cross-attention sub-layer — a
        # dedicated LayerNorm per sub-layer is more common; confirm intended.
        x = x + self.cross_attn(x_encoder, self.ln1(x))
        x = x + self.mlp(self.ln2(x))
        return x


class Model(nn.Module):
    """GPT-style encoder-decoder over a single shared vocabulary and a
    shared token/position embedding for both streams."""

    def __init__(self, config):
        super().__init__()
        # shared token embedding and learned positional embedding
        self.tok_emb = nn.Embedding(config[varables.SIZE_VOCAB], config[varables.DIM_EMBEDDING])
        self.pos_emb = nn.Parameter(torch.zeros(1, config[varables.SIZE_BLOCK], config[varables.DIM_EMBEDDING]))
        self.drop = nn.Dropout(config[varables.RATE_DROPOUT])
        self.encoder_blocks = nn.ModuleList([EncoderBlock(config) for _ in range(config[varables.NUM_LAYERS])])
        self.decoder_blocks = nn.ModuleList([DecoderBlock(config) for _ in range(config[varables.NUM_LAYERS])])
        # self.blocks = nn.Sequential(*[DecoderBlock(config) for _ in range(config[varables.NUM_LAYERS])])
        self.ln_f = nn.LayerNorm(config[varables.DIM_EMBEDDING])
        self.head = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.SIZE_VOCAB], bias=False)
        self.block_size = config[varables.SIZE_BLOCK]
        self.apply(self._init_weights)
        logger.info("number of parameters: %e", sum(p.numel() for p in self.parameters()))
        self.optimizer = None

    def get_block_size(self):
        """Maximum sequence length supported by the positional embedding."""
        return self.block_size

    def _init_weights(self, module):
        """minGPT-style init: N(0, 0.02) for linears/embeddings, unit LayerNorm."""
        if isinstance(module, (nn.Linear, nn.Embedding)):
            module.weight.data.normal_(mean=0.0, std=0.02)
            if isinstance(module, nn.Linear) and module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

    def init_optimizers(self, train_config):
        # Adam over all parameters at the configured learning rate
        optimizer = torch.optim.Adam(self.parameters(), lr=train_config[varables.RATE_LEARNING])
        return optimizer

    def init_scheduler(self, train_config):
        # StepLR over self.optimizer (must already be assigned)
        scheduler = torch.optim.lr_scheduler.StepLR(self.optimizer, step_size=train_config[varables.SIZE_STEP], gamma=train_config[varables.GAMMA])
        return scheduler

    def get_collate_fn(self, vocab):
        """Return a collate fn right-padding id lists to the batch max length."""
        def collate(results):
            x_in = [a[0] for a in results]
            y_in = [a[1] for a in results]
            boundary = -1
            max_len_x = max([len(a) for a in x_in])
            max_len_y = max([len(a) for a in y_in])
            x = torch.tensor([(a + [vocab[varables.TOKEN_PAD]] * (max_len_x - len(a))) for a in x_in], dtype=torch.long)
            y = torch.tensor([(a + [vocab[varables.TOKEN_PAD]] * (max_len_y - len(a))) for a in y_in], dtype=torch.long)
            return x, y, boundary
        return collate

    def forward(self, x_in, y_in, y_out=None, boundary=None):
        # embed tokens + positions for both encoder and decoder inputs
        x_in = self.drop(self.tok_emb(x_in) + self.pos_emb[:, :x_in.size()[1], :])
        y_in = self.drop(self.tok_emb(y_in) + self.pos_emb[:, :y_in.size()[1], :])
        #
        for encoder_block in self.encoder_blocks:
            x_in = encoder_block(x_in)
        # NOTE(review): ln_f is shared between the encoder and decoder
        # streams — confirm a single final LayerNorm is intended for both.
        x_in = self.ln_f(x_in)
        for decoder_block in self.decoder_blocks:
            y_in = decoder_block(x_in, y_in)
        y_in = self.ln_f(y_in)
        logits = self.head(y_in)
        loss = None
        if y_out is not None:
            loss = F.cross_entropy(logits.view(-1, logits.size(-1)), y_out.view(-1))
        return logits, loss

# mark test
import Variable + +# class PositionalEncoder(nn.Module): +# def __init__(self, config): +# super().__init__() +# pe = torch.zeros(config[varables.SIZE_BLOCK], config[varables.DIM_ATTENTION]) +# for pos in range(config[varables.SIZE_BLOCK]): +# for i in range(0, config[varables.DIM_ATTENTION], 2): +# pe[pos, i] = \ +# math.sin(pos / (10000 ** ((2 * i)/config[varables.DIM_ATTENTION]))) +# pe[pos, i + 1] = \ +# math.cos(pos / (10000 ** ((2 * (i + 1))/config[varables.DIM_ATTENTION]))) +# pe = pe.unsqueeze(0) +# self.register_buffer('pe', pe) +# def forward(self, T): +# #add constant to embedding +# x = Variable(self.pe[:,:T], requires_grad=False) +# return x + + + +class PositionalEncoder(nn.Module): + def __init__(self, config): + super(PositionalEncoder, self).__init__() + self.Dropout = nn.Dropout(p=config[varables.RATE_DROPOUT]) + max_len = config[varables.SIZE_BLOCK] + pe = torch.zeros(max_len, config[varables.DIM_ATTENTION]) + position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1) + div_term = torch.exp(torch.arange(0, config[varables.DIM_ATTENTION], 2).float() * (-math.log(10000.0) / config[varables.DIM_ATTENTION])) + pe[:, 0::2] = torch.sin(position * div_term) + pe[:, 1::2] = torch.cos(position * div_term) + pe = pe.unsqueeze(0) + self.register_buffer('pe', pe) + def forward(self, T): + x = self.Dropout(self.pe[:,:T, :]) + return x + + + +class Attention(nn.Module): + def __init__(self, config): + super().__init__() + assert config[varables.DIM_ATTENTION] % config[varables.NUM_HEADS] == 0 + self.Key = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION]) + self.Query = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION]) + self.Value = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION]) + self.Dropout_Attention = nn.Dropout(config[varables.RATE_DROPOUT]) + self.Dropout_Residue = nn.Dropout(config[varables.RATE_DROPOUT]) + self.Projection = nn.Linear(config[varables.DIM_ATTENTION], 
config[varables.DIM_EMBEDDING]) + self.NumberOfHeads = config[varables.NUM_HEADS] + self.DimHead = config[varables.DIM_ATTENTION] // self.NumberOfHeads + self.DimAttention = config[varables.DIM_ATTENTION] + + def forward(self, X_1,X_2, mask=None): + if X_2 is None: + X_2 = X_1 + BatchSize, T_Encoder, _ = X_1.size() + BatchSize, T_Decoder, _ = X_2.size() + K = self.Key( X_1).view(BatchSize, T_Encoder, self.NumberOfHeads,self.DimHead).transpose(1, 2) + Q = self.Query(X_2).view(BatchSize, T_Decoder, self.NumberOfHeads,self.DimHead).transpose(1, 2) + V = self.Value(X_1).view(BatchSize, T_Encoder, self.NumberOfHeads,self.DimHead).transpose(1, 2) + # k,q,v dimension: (BatchSize, SequenceSize, NumberOfHeads, HeadDimension) 3,4,5,16 + ScoreAttention = (Q @ K.transpose(-2, -1)) / math.sqrt(self.DimHead) + ScoreAttention = ScoreAttention.masked_fill(mask==0, -1e9) + ScoreAttention = F.softmax(ScoreAttention, dim=-1) + ScoreAttention = self.Dropout_Attention(ScoreAttention) + # k.transpose(-2,-1): 3,4,16,5 + # (q@(k.transpose(-2,-1))): 3,4,5,5 + Z = ScoreAttention @ V + # y dimension: 3,4,5,16 + Z = Z.transpose(1, 2).contiguous().view(BatchSize, T_Decoder, self.DimAttention) + # y dimension: 3,5,64 + Z = self.Dropout_Residue(self.Projection(Z)) + return Z + + + + + + + + + + +class FeedForward(nn.Module): + def __init__(self, config): + super().__init__() + if config[varables.DIM_FEEDFORWARD] == 0: + Dim_FeedForward = config[varables.DIM_ATTENTION] *4 + else: + Dim_FeedForward = config[varables.DIM_FEEDFORWARD] + self.Linear1 = nn.Linear(config[varables.DIM_EMBEDDING], Dim_FeedForward) + self.GELU = nn.GELU() + self.Linear2 = nn.Linear(Dim_FeedForward, config[varables.DIM_EMBEDDING]) + self.Dropout = nn.Dropout(config[varables.RATE_DROPOUT]) + + def forward(self,x): + x = self.Linear1(x) + x = self.GELU (x) + x = self.Dropout(x) + x = self.Linear2(x) + return x + + + + +class EncoderBlock(nn.Module): + def __init__(self, config): + super().__init__() + self.LayerNorm1 = 
nn.LayerNorm(config[varables.DIM_EMBEDDING]) + self.LayerNorm2 = nn.LayerNorm(config[varables.DIM_EMBEDDING]) + self.Dropout1 = nn.Dropout(config[varables.RATE_DROPOUT]) + self.Dropout2 = nn.Dropout(config[varables.RATE_DROPOUT]) + self.Attention = Attention( config) + self.FeedForward = FeedForward(config) + + def forward(self, X_Encoder,Mask_Encoder): + X_Encoder = self.Dropout1(X_Encoder + self.Attention (self.LayerNorm1(X_Encoder), None, Mask_Encoder)) + X_Encoder = self.Dropout2(X_Encoder + self.FeedForward(self.LayerNorm2(X_Encoder))) + return X_Encoder + +class DecoderBlock(nn.Module): + def __init__(self, config): + super().__init__() + self.LayerNorm1 = nn.LayerNorm(config[varables.DIM_EMBEDDING]) + self.LayerNorm2 = nn.LayerNorm(config[varables.DIM_EMBEDDING]) + self.LayerNorm3 = nn.LayerNorm(config[varables.DIM_EMBEDDING]) + self.Dropout1 = nn.Dropout(config[varables.RATE_DROPOUT]) + self.Dropout2 = nn.Dropout(config[varables.RATE_DROPOUT]) + self.Dropout3 = nn.Dropout(config[varables.RATE_DROPOUT]) + self.AttentionMasked = Attention( config) + self.AttentionCross = Attention( config) + self.FeedForward = FeedForward(config) + + def forward(self, X_Encoder,X_Decoder,Mask_Cross,Mask_Decoder): + X_Decoder = self.Dropout1(X_Decoder + self.AttentionMasked(self.LayerNorm1(X_Decoder), None, Mask_Decoder)) + X_Decoder = self.Dropout2(X_Decoder + self.AttentionCross ( X_Encoder, self.LayerNorm2(X_Decoder), Mask_Cross )) + X_Decoder = self.Dropout3(X_Decoder + self.FeedForward (self.LayerNorm3(X_Decoder) )) + return X_Decoder + + + + + + + + + + + + + + + + + + + +class Model(nn.Module): + def __init__(self, config): + super().__init__() + # Varables + self.Dim_Attention = config[varables.DIM_ATTENTION] + self.Token_Padding_Encoder = config["Token_Padding_Encoder"] + self.Token_Padding_Decoder = config["Token_Padding_Decoder"] + # Embedding and positional encoding layers + self.Embedding_Encoder = nn.Embedding(len(config["vocab_encoder"]), 
config[varables.DIM_ATTENTION]) + self.Embedding_Decoder = nn.Embedding(len(config["vocab_decoder"]), config[varables.DIM_ATTENTION]) + self.pos_emb = PositionalEncoder(config) + # Dropout and normalization layers + self.Dropout1 = nn.Dropout(config[varables.RATE_DROPOUT]) + self.Dropout2 = nn.Dropout(config[varables.RATE_DROPOUT]) + self.LayerNorm1 = nn.LayerNorm(config[varables.DIM_EMBEDDING]) + self.LayerNorm2 = nn.LayerNorm(config[varables.DIM_EMBEDDING]) + # Transformer layers + self.encoder_blocks = nn.ModuleList([EncoderBlock(config) for _ in range(config[varables.NUM_LAYERS])]) + self.decoder_blocks = nn.ModuleList([DecoderBlock(config) for _ in range(config[varables.NUM_LAYERS])]) + # Output layer + self.head = nn.Linear(config[varables.DIM_ATTENTION], len(config["vocab_decoder"]), bias=False) + # Init + self.apply(self._init_weights) + self.optimizer = None + # logger.info("number of parameters: %e", sum(p.numel() for p in self.parameters())) + + def _init_weights(self, module): + for p in module.parameters(): + if p.dim() > 1: + nn.init.xavier_uniform_(p) + # if isinstance(module, (nn.Linear, nn.Embedding)): + # module.weight.data.normal_(mean=0.0, std=0.02) + # if isinstance(module, nn.Linear) and module.bias is not None: + # module.bias.data.zero_() + # elif isinstance(module, nn.LayerNorm): + # module.bias.data.zero_() + # module.weight.data.fill_(1.0) + def init_optimizers(self,train_config): + optimizer = torch.optim.Adam(self.parameters(), lr=train_config[varables.RATE_LEARNING]) + return optimizer + def init_scheduler(self,train_config): + scheduler = torch.optim.lr_scheduler.StepLR(self.optimizer, step_size=train_config[varables.SIZE_STEP], gamma=train_config[varables.GAMMA]) + return scheduler + def get_collate_fn(self, vocab_encoder,vocab_decoder): + def collate(results): + X_Encoder = [a[0] for a in results] + X_Decoder = [a[1] for a in results] + boundary = -1 + max_len_x = max([len(a) for a in X_Encoder]) + max_len_y = max([len(a) for a in 
X_Decoder]) + x = torch.tensor([(a+[vocab_encoder[varables.TOKEN_PAD]]*(max_len_x-len(a))) for a in X_Encoder],dtype=torch.long) + y = torch.tensor([(a+[vocab_decoder[varables.TOKEN_PAD]]*(max_len_y-len(a))) for a in X_Decoder],dtype=torch.long) + return x,y,boundary + return collate + + + def generate_masks(self,X_Encoder, X_Decoder): + # Generate encoder, decoder, cross masks + T = X_Decoder.shape[1] + Mask_Encoder = (X_Encoder != self.Token_Padding_Encoder).unsqueeze(-2).unsqueeze(-2) + Mask_Decoder = (X_Decoder != self.Token_Padding_Decoder).unsqueeze(-2).unsqueeze(-2).repeat(1,1,T,1) + Mask_Cross = (X_Encoder != self.Token_Padding_Encoder).unsqueeze(-2).unsqueeze(-2) + mask_tril = torch.tril(torch.ones(T, T)).view(1, 1, T, T).to(Mask_Decoder.device) + Mask_Decoder = Mask_Decoder.masked_fill(mask_tril==0,0) + return Mask_Encoder,Mask_Decoder,Mask_Cross + + def forward(self, X_Encoder, X_Decoder, Y_Decoder_Ref=None,boundary=None): + Mask_Encoder, Mask_Decoder,Mask_Cross = self.generate_masks(X_Encoder, X_Decoder) + # preprocess + X_Encoder = self.Dropout1(self.Embedding_Encoder(X_Encoder) * math.sqrt(self.Dim_Attention) + self.pos_emb(X_Encoder.size(1))) + X_Decoder = self.Dropout2(self.Embedding_Decoder(X_Decoder) * math.sqrt(self.Dim_Attention) + self.pos_emb(X_Decoder.size(1))) + #### Now X_Encoder: BatchSize, SequenceLength, DimAttention + # Encoder blocks + for encoder_block in self.encoder_blocks: + X_Encoder = encoder_block(X_Encoder,Mask_Encoder) + X_Encoder = self.LayerNorm1(X_Encoder) + # Decoder blocks + for decoder_block in self.decoder_blocks: + X_Decoder = decoder_block(X_Encoder,X_Decoder,Mask_Cross,Mask_Decoder) + X_Decoder = self.LayerNorm2(X_Decoder) + Y_Decoder_Logits = self.head(X_Decoder) + loss = None + if Y_Decoder_Ref is not None: + loss = F.cross_entropy(Y_Decoder_Logits.view(-1, Y_Decoder_Logits.size(-1)), Y_Decoder_Ref.view(-1),ignore_index=self.Token_Padding_Decoder) + return Y_Decoder_Logits, loss + + + + + + + + + + + + + # def 
generate_masks(self,X_Encoder, X_Decoder): + # # Generate encoder, decoder, cross masks + # Mask_Encoder = (X_Encoder != self.Token_Padding_Encoder).unsqueeze(-2).int().cpu() + # Mask_Decoder = (X_Decoder != self.Token_Padding_Decoder).unsqueeze(-2).int().cpu() + # Mask_Cross = Mask_Decoder.unsqueeze(-1) @ Mask_Encoder.unsqueeze(-2) + # Mask_Encoder = Mask_Encoder.unsqueeze(-1) @ Mask_Encoder.unsqueeze(-2) + # Mask_Decoder = Mask_Decoder.unsqueeze(-1) @ Mask_Decoder.unsqueeze(-2) + # T = X_Decoder.shape[1] + # mask_tril = torch.tril(torch.ones(T, T)).view(1, 1, T, T) + # Mask_Decoder = Mask_Decoder.masked_fill(mask_tril==0,0) + # Mask_Encoder = Mask_Encoder.to(X_Encoder.device) + # Mask_Decoder = Mask_Decoder.to(X_Decoder.device) + # Mask_Cross = Mask_Cross.to(X_Encoder.device) + # return Mask_Encoder,Mask_Decoder,Mask_Cross diff --git a/SCMG/models/Reinvent/sampler.py b/SCMG/models/Reinvent/sampler.py new file mode 100644 index 0000000000000000000000000000000000000000..1c606302447534d425b50bbd15c153ea79895b65 --- /dev/null +++ b/SCMG/models/Reinvent/sampler.py @@ -0,0 +1,85 @@ +import random +import numpy as np +import torch +import torch.nn as nn +from torch.nn import functional as F + +def set_seed(seed): + random.seed(seed) + np.random.seed(seed) + torch.manual_seed(seed) + torch.cuda.manual_seed_all(seed) + +def top_k_logits(logits, k): + v, ix = torch.topk(logits, k) + out = logits.clone() + out[out < v[:, [-1]]] = -float('Inf') + return out + +@torch.no_grad() +def sample(model, x, steps, temperature=1.0, sample=False, top_k=None): + block_size = model.get_block_size() + model.eval() + for k in range(steps): + x_cond = x if x.size(1) <= block_size else x[:, -block_size:] + logits, _ = model(x_cond) + logits = logits[:, -1, :] / temperature + if top_k is not None: + logits = top_k_logits(logits, top_k) + probs = F.softmax(logits, dim=-1) + if sample: + ix = torch.multinomial(probs, num_samples=1) + else: + _, ix = torch.topk(probs, k=1, dim=-1) + x = 
torch.cat((x, ix), dim=1) + + return x + + + + +@torch.no_grad() +def sample(model, x, steps, temperature=1.0,boundary=None): + block_size = model.get_block_size() + model.eval() + for k in range(steps): + x_cond = x if x.size(1) <= block_size else x[:, -block_size:] + logits, _ = model(x_cond,boundary=boundary) + logits = logits[:, -1, :] / temperature + probs = F.softmax(logits, dim=-1) + ix = torch.multinomial(probs, num_samples=1) + x = torch.cat((x, ix), dim=1) + return x + +'L_5*C(=O)NCc1cccc(OC)c1.*c1nsc2ccccc12COc1cccc(CNC(=O)c2cccc(NC(=O)c3nsc4ccccc34)c2)c1' + +# for i in range(1,21): +def sample_L(i,option='string'): + # i=2 + prefix = 'L_'+str(i) + string_input = prefix + '*O=C1NN=Cc2c1cccc2.*O=C(C1CC1)N1CCNCC1' + array_input = [vocab[a] for a in [''] + list(string_input)] + boundary = [len(array_input)] + tensor_input = torch.tensor(array_input,device='cuda').unsqueeze(0).repeat(32,1) + boundary = boundary*32 + tensor_output = sample(model,tensor_input,250,boundary=boundary) + strings_output = [] + for j in range(tensor_output.shape[0]): + list_string_output = [inv[a] for a in tensor_output[j,boundary[j]:].cpu().numpy() if a != vocab['']] + # if list_string_output[0] == '': + # list_string_output = list_string_output[1:] + if list_string_output[-1] == '': + list_string_output = list_string_output[:-1] + string_output = ''.join(list_string_output) + strings_output.append(string_output) + print(string_output) + for j in range(tensor_output.shape[0]): + if test_valid(strings_output[j]): + print(1) + else: + print(0) + + # logits,_ = model(tensor_input,boundary=boundary) + + +['', 'L', '_', '5', '*', 'C', '(', '=', 'O', ')', 'N', 'C', 'c', '1', 'c', 'c', 'c', 'c', '(', 'O', 'C', ')', 'c', '1', '.', '*', 'c', '1', 'n', 's', 'c', '2', 'c', 'c', 'c', 'c', 'c', '1', '2', 'C', 'O', 'c', '1', 'c', 'c', 'c', 'c', '(', 'C', 'N', 'C', '(', '=', 'O', ')', 'c', '2', 'c', 'c', 'c', 'c', '(', 'N', 'C', '(', '=', 'O', ')', 'c', '3', 'n', 's', 'c', '4', 'c', 'c', 'c', 
'c', 'c', '3', '4', ')', 'c', '2', ')', 'c', '1', ''] diff --git a/SCMG/models/Reinvent_Scaffold_Decorator/__init__.py b/SCMG/models/Reinvent_Scaffold_Decorator/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/SCMG/models/Reinvent_Scaffold_Decorator/__pycache__/__init__.cpython-310.pyc b/SCMG/models/Reinvent_Scaffold_Decorator/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0cdc0c50cde1ad2794fe7ccb4470ee6560a2fff0 Binary files /dev/null and b/SCMG/models/Reinvent_Scaffold_Decorator/__pycache__/__init__.cpython-310.pyc differ diff --git a/SCMG/models/Reinvent_Scaffold_Decorator/__pycache__/model copy 2.cpython-310.pyc b/SCMG/models/Reinvent_Scaffold_Decorator/__pycache__/model copy 2.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4e507145c9134a603126084bf93df297b3de251f Binary files /dev/null and b/SCMG/models/Reinvent_Scaffold_Decorator/__pycache__/model copy 2.cpython-310.pyc differ diff --git a/SCMG/models/Reinvent_Scaffold_Decorator/__pycache__/model copy.cpython-310.pyc b/SCMG/models/Reinvent_Scaffold_Decorator/__pycache__/model copy.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..3a1741ccf7288d663bb655dfe90a6a08ac942d8d Binary files /dev/null and b/SCMG/models/Reinvent_Scaffold_Decorator/__pycache__/model copy.cpython-310.pyc differ diff --git a/SCMG/models/Reinvent_Scaffold_Decorator/__pycache__/sampler.cpython-310.pyc b/SCMG/models/Reinvent_Scaffold_Decorator/__pycache__/sampler.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8d4e71a297c36745f63960a12870eed7f0cf8b1e Binary files /dev/null and b/SCMG/models/Reinvent_Scaffold_Decorator/__pycache__/sampler.cpython-310.pyc differ diff --git a/SCMG/models/Reinvent_Scaffold_Decorator/model copy 2.py b/SCMG/models/Reinvent_Scaffold_Decorator/model copy 2.py new 
file mode 100644 index 0000000000000000000000000000000000000000..8b254fbe02eafd3f7370cfae17eb6729563aa260 --- /dev/null +++ b/SCMG/models/Reinvent_Scaffold_Decorator/model copy 2.py @@ -0,0 +1,420 @@ +import math +import logging + +import torch +import torch.nn as nn +from torch.nn import functional as F + +logger = logging.getLogger(__name__) +from SCMG.config import varables + +# class ModelConfig(): +# rate_dropout_embedding = 0.1 +# rate_dropout_residue = 0.1 +# rate_dropout_attention = 0.1 +# block_size=125 +# def __init__(self, size_vocab, **kwargs): +# self.size_vocab = size_vocab +# for k,v in kwargs.items(): +# setattr(self, k, v) + +class CausalSelfAttention(nn.Module): + def __init__(self, config): + super().__init__() + assert config[varables.DIM_ATTENTION] % config[varables.NUM_HEADS] == 0 + self.key = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION]) + self.query = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION]) + self.value = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION]) + self.dropout_attention = nn.Dropout(config[varables.RATE_DROPOUT]) + self.dropout_residue = nn.Dropout(config[varables.RATE_DROPOUT]) + self.projection = nn.Linear(config[varables.DIM_ATTENTION], config[varables.DIM_EMBEDDING]) + self.register_buffer("mask", torch.tril(torch.ones(config[varables.SIZE_BLOCK], config[varables.SIZE_BLOCK])) + .view(1, 1, config[varables.SIZE_BLOCK], config[varables.SIZE_BLOCK])) + self.n_head = config[varables.NUM_HEADS] + self.single_head_dim = config[varables.DIM_ATTENTION] // self.n_head + self.attention_features = config[varables.DIM_ATTENTION] + + def forward(self, x, layer_past=None): + B, T, C = x.size() + k = self.key(x).view(B, T, self.n_head,self.single_head_dim).transpose(1, 2) + q = self.query(x).view(B, T, self.n_head,self.single_head_dim).transpose(1, 2) + v = self.value(x).view(B, T, self.n_head,self.single_head_dim).transpose(1, 2) + att = (q @ 
k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1))) + att = att.masked_fill(self.mask[:,:,:T,:T] == 0, float('-inf')) + att = F.softmax(att, dim=-1) + att = self.dropout_attention(att) + y = att @ v + y = y.transpose(1, 2).contiguous().view(B, T, self.attention_features) + y = self.dropout_residue(self.projection(y)) + return y + + +class CrossAttention(nn.Module): + def __init__(self, config): + super().__init__() + assert config[varables.DIM_ATTENTION] % config[varables.NUM_HEADS] == 0 + self.key = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION]) + self.query = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION]) + self.value = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION]) + self.dropout_attention = nn.Dropout(config[varables.RATE_DROPOUT]) + self.dropout_residue = nn.Dropout(config[varables.RATE_DROPOUT]) + self.projection = nn.Linear(config[varables.DIM_ATTENTION], config[varables.DIM_EMBEDDING]) + self.n_head = config[varables.NUM_HEADS] + self.single_head_dim = config[varables.DIM_ATTENTION] // self.n_head + self.attention_features = config[varables.DIM_ATTENTION] + self.register_buffer("mask", torch.tril(torch.ones(config[varables.SIZE_BLOCK], config[varables.SIZE_BLOCK])) + .view(1, 1, config[varables.SIZE_BLOCK], config[varables.SIZE_BLOCK])) + + def forward(self, x_encoder,x_decoder, layer_past=None): + B_encoder, T_encoder, C_encoder = x_encoder.size() + B_decoder, T_decoder, C_decoder = x_decoder.size() + k = self.key( x_encoder).view(B_encoder, T_encoder, self.n_head,self.single_head_dim).transpose(1, 2) + q = self.query(x_decoder).view(B_encoder, T_decoder, self.n_head,self.single_head_dim).transpose(1, 2) + v = self.value(x_encoder).view(B_encoder, T_encoder, self.n_head,self.single_head_dim).transpose(1, 2) + att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1))) + att = att.masked_fill(self.mask[:,:,:T_decoder,:T_encoder] == 0, float('-inf')) + att = F.softmax(att, 
dim=-1) + att = self.dropout_attention(att) + y = att @ v + y = y.transpose(1, 2).contiguous().view(B_encoder, T_decoder, self.attention_features) + y = self.dropout_residue(self.projection(y)) + return y + + + + +class EncoderBlock(nn.Module): + def __init__(self, config): + super().__init__() + self.ln1 = nn.LayerNorm(config[varables.DIM_EMBEDDING]) + self.ln2 = nn.LayerNorm(config[varables.DIM_EMBEDDING]) + self.attn = CausalSelfAttention(config) + self.mlp = nn.Sequential( + nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_FEEDFORWARD]), + nn.GELU(), + nn.Linear(config[varables.DIM_FEEDFORWARD], config[varables.DIM_EMBEDDING]), + nn.Dropout(config[varables.RATE_DROPOUT]), + ) + + def forward(self, x): + # = y_input + x = x + self.attn(self.ln1(x)) + x = x + self.mlp(self.ln2(x)) + return x + +class DecoderBlock(nn.Module): + def __init__(self, config): + super().__init__() + self.ln1 = nn.LayerNorm(config[varables.DIM_EMBEDDING]) + self.ln2 = nn.LayerNorm(config[varables.DIM_EMBEDDING]) + self.masked_attn = CausalSelfAttention(config) + self.cross_attn = CrossAttention(config) + self.mlp = nn.Sequential( + nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_FEEDFORWARD]), + nn.GELU(), + nn.Linear(config[varables.DIM_FEEDFORWARD], config[varables.DIM_EMBEDDING]), + nn.Dropout(config[varables.RATE_DROPOUT]), + ) + + def forward(self, x_encoder,x): + # = y_input + x = x + self.masked_attn(self.ln1(x)) + x = x + self.cross_attn(x_encoder,self.ln1(x)) + x = x + self.mlp(self.ln2(x)) + return x + + + + + + + + + + + + + + + + + +import torch +import torch.nn as nn +import torch.nn.functional as F +import math + + +class Norm(nn.Module): + def __init__(self, d_model, eps = 1e-6): + super().__init__() + + self.size = d_model + + # create two learnable parameters to calibrate normalisation + self.alpha = nn.Parameter(torch.ones(self.size)) + self.bias = nn.Parameter(torch.zeros(self.size)) + + self.eps = eps + + def forward(self, x): + norm = 
self.alpha * (x - x.mean(dim=-1, keepdim=True)) \ + / (x.std(dim=-1, keepdim=True) + self.eps) + self.bias + return norm + +def attention(q, k, v, d_k, mask=None, dropout=None): + + scores = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(d_k) + + if mask is not None: + mask = mask.unsqueeze(1) + scores = scores.masked_fill(mask == 0, -1e9) + + scores = F.softmax(scores, dim=-1) + + if dropout is not None: + scores = dropout(scores) + + output = torch.matmul(scores, v) + return output + +class MultiHeadAttention(nn.Module): + def __init__(self, heads, d_model, dropout = 0.1): + super().__init__() + + self.d_model = d_model + self.d_k = d_model // heads + self.h = heads + + self.q_linear = nn.Linear(d_model, d_model) + self.v_linear = nn.Linear(d_model, d_model) + self.k_linear = nn.Linear(d_model, d_model) + + self.dropout = nn.Dropout(dropout) + self.out = nn.Linear(d_model, d_model) + + def forward(self, q, k, v, mask=None): + + bs = q.size(0) + + # perform linear operation and split into N heads + k = self.k_linear(k).view(bs, -1, self.h, self.d_k) + q = self.q_linear(q).view(bs, -1, self.h, self.d_k) + v = self.v_linear(v).view(bs, -1, self.h, self.d_k) + + # transpose to get dimensions bs * N * sl * d_model + k = k.transpose(1,2) + q = q.transpose(1,2) + v = v.transpose(1,2) + + + # calculate attention using function we will define next + scores = attention(q, k, v, self.d_k, mask, self.dropout) + # concatenate heads and put through final linear layer + concat = scores.transpose(1,2).contiguous()\ + .view(bs, -1, self.d_model) + output = self.out(concat) + + return output + +class FeedForward(nn.Module): + def __init__(self, d_model, d_ff=2048, dropout = 0.1): + super().__init__() + + # We set d_ff as a default to 2048 + self.linear_1 = nn.Linear(d_model, d_ff) + self.dropout = nn.Dropout(dropout) + self.linear_2 = nn.Linear(d_ff, d_model) + + def forward(self, x): + x = self.dropout(F.relu(self.linear_1(x))) + x = self.linear_2(x) + return x + + + + +import 
torch +import torch.nn as nn +import copy + + +class EncoderLayer(nn.Module): + def __init__(self, d_model, heads, dropout=0.1): + super().__init__() + self.norm_1 = Norm(d_model) + self.norm_2 = Norm(d_model) + self.attn = MultiHeadAttention(heads, d_model, dropout=dropout) + self.ff = FeedForward(d_model, dropout=dropout) + self.dropout_1 = nn.Dropout(dropout) + self.dropout_2 = nn.Dropout(dropout) + + def forward(self, x, mask): + x2 = self.norm_1(x) + x = x + self.dropout_1(self.attn(x2,x2,x2,mask)) + x2 = self.norm_2(x) + x = x + self.dropout_2(self.ff(x2)) + return x + +# build a decoder layer with two multi-head attention layers and +# one feed-forward layer +class DecoderLayer(nn.Module): + def __init__(self, d_model, heads, dropout=0.1): + super().__init__() + self.norm_1 = Norm(d_model) + self.norm_2 = Norm(d_model) + self.norm_3 = Norm(d_model) + + self.dropout_1 = nn.Dropout(dropout) + self.dropout_2 = nn.Dropout(dropout) + self.dropout_3 = nn.Dropout(dropout) + + self.attn_1 = MultiHeadAttention(heads, d_model, dropout=dropout) + self.attn_2 = MultiHeadAttention(heads, d_model, dropout=dropout) + self.ff = FeedForward(d_model, dropout=dropout) + + def forward(self, x, e_outputs, src_mask, trg_mask): + x2 = self.norm_1(x) + x = x + self.dropout_1(self.attn_1(x2, x2, x2, trg_mask)) + x2 = self.norm_2(x) + x = x + self.dropout_2(self.attn_2(x2, e_outputs, e_outputs, \ + src_mask)) + x2 = self.norm_3(x) + x = x + self.dropout_3(self.ff(x2)) + return x + + +import torch +import torch.nn as nn +import math +from torch.autograd import Variable + +class Embedder(nn.Module): + def __init__(self, vocab_size, d_model): + super().__init__() + self.d_model = d_model + self.embed = nn.Embedding(vocab_size, d_model) + def forward(self, x): + return self.embed(x) + +class PositionalEncoder(nn.Module): + def __init__(self, d_model, max_seq_len = 200, dropout = 0.1): + super().__init__() + self.d_model = d_model + self.dropout = nn.Dropout(dropout) + # create constant 
'pe' matrix with values dependant on + # pos and i + pe = torch.zeros(max_seq_len, d_model) + for pos in range(max_seq_len): + for i in range(0, d_model, 2): + pe[pos, i] = \ + math.sin(pos / (10000 ** ((2 * i)/d_model))) + pe[pos, i + 1] = \ + math.cos(pos / (10000 ** ((2 * (i + 1))/d_model))) + pe = pe.unsqueeze(0) + self.register_buffer('pe', pe) + + + def forward(self, x): + # make embeddings relatively larger + x = x * math.sqrt(self.d_model) + #add constant to embedding + seq_len = x.size(1) + pe = Variable(self.pe[:,:seq_len], requires_grad=False) + if x.is_cuda: + pe.cuda() + x = x + pe + return self.dropout(x) + +def get_clones(module, N): + return nn.ModuleList([copy.deepcopy(module) for i in range(N)]) + +class Encoder(nn.Module): + def __init__(self, vocab_size, d_model, N, heads, dropout): + super().__init__() + self.N = N + self.embed = Embedder(vocab_size, d_model) + self.pe = PositionalEncoder(d_model, dropout=dropout) + self.layers = get_clones(EncoderLayer(d_model, heads, dropout), N) + self.norm = Norm(d_model) + def forward(self, src, mask): + x = self.embed(src) + x = self.pe(x) + for i in range(self.N): + x = self.layers[i](x, mask) + return self.norm(x) + +class Decoder(nn.Module): + def __init__(self, vocab_size, d_model, N, heads, dropout): + super().__init__() + self.N = N + self.embed = Embedder(vocab_size, d_model) + self.pe = PositionalEncoder(d_model, dropout=dropout) + self.layers = get_clones(DecoderLayer(d_model, heads, dropout), N) + self.norm = Norm(d_model) + def forward(self, trg, e_outputs, src_mask, trg_mask): + x = self.embed(trg) + x = self.pe(x) + for i in range(self.N): + x = self.layers[i](x, e_outputs, src_mask, trg_mask) + return self.norm(x) + +class Model(nn.Module): + def __init__(self, config): + super().__init__() + self.encoder = Encoder(len(config["vocab_encoder"]), config[varables.DIM_ATTENTION], config[varables.NUM_LAYERS], config[varables.NUM_HEADS], config[varables.RATE_DROPOUT]) + self.decoder = 
Decoder(len(config["vocab_decoder"]), config[varables.DIM_ATTENTION], config[varables.NUM_LAYERS], config[varables.NUM_HEADS], config[varables.RATE_DROPOUT]) + self.out = nn.Linear(config[varables.DIM_ATTENTION], len(config["vocab_decoder"])) + # self.tok_emb = nn.Embedding(config[varables.SIZE_VOCAB], config[varables.DIM_EMBEDDING]) + # self.pos_emb = nn.Parameter(torch.zeros(1, config[varables.SIZE_BLOCK], config[varables.DIM_EMBEDDING])) + # self.drop = nn.Dropout(config[varables.RATE_DROPOUT]) + # self.encoder_blocks = nn.ModuleList([EncoderBlock(config) for _ in range(config[varables.NUM_LAYERS])]) + # self.decoder_blocks = nn.ModuleList([DecoderBlock(config) for _ in range(config[varables.NUM_LAYERS])]) + # self.blocks = nn.Sequential(*[DecoderBlock(config) for _ in range(config[varables.NUM_LAYERS])]) + # self.ln_f = nn.LayerNorm(config[varables.DIM_EMBEDDING]) + # self.head = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.SIZE_VOCAB], bias=False) + # self.block_size = config[varables.SIZE_BLOCK] + # self.apply(self._init_weights) + # logger.info("number of parameters: %e", sum(p.numel() for p in self.parameters())) + self.optimizer = None + + def get_block_size(self): + return self.block_size + + def _init_weights(self, module): + if isinstance(module, (nn.Linear, nn.Embedding)): + module.weight.data.normal_(mean=0.0, std=0.02) + if isinstance(module, nn.Linear) and module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.LayerNorm): + module.bias.data.zero_() + module.weight.data.fill_(1.0) + def init_optimizers(self,train_config): + optimizer = torch.optim.Adam(self.parameters(), lr=train_config[varables.RATE_LEARNING]) + return optimizer + def init_scheduler(self,train_config): + scheduler = torch.optim.lr_scheduler.StepLR(self.optimizer, step_size=train_config[varables.SIZE_STEP], gamma=train_config[varables.GAMMA]) + return scheduler + def get_collate_fn(self, vocab_encoder,vocab_decoder): + def collate(results): + 
x_in = [a[0] for a in results] + y_in = [a[1] for a in results] + boundary = -1 + max_len_x = max([len(a) for a in x_in]) + max_len_y = max([len(a) for a in y_in]) + x = torch.tensor([(a+[vocab_encoder[varables.TOKEN_PAD]]*(max_len_x-len(a))) for a in x_in],dtype=torch.long) + y = torch.tensor([(a+[vocab_decoder[varables.TOKEN_PAD]]*(max_len_y-len(a))) for a in y_in],dtype=torch.long) + return x,y,boundary + return collate + def forward(self, src, trg, trg_out, boundary=None): + src_mask = None + trg_mask = torch.tril(torch.ones(trg.shape[1], trg.shape[1])).view(1, 1, trg.shape[1], trg.shape[1]).to(trg.device) + e_outputs = self.encoder(src, src_mask) + d_output = self.decoder(trg, e_outputs, src_mask, trg_mask) + logits = self.out(d_output) + loss = None + if trg_out is not None: + loss = F.cross_entropy(logits.view(-1, logits.size(-1)), trg_out.view(-1)) + return logits, loss + +# mark test \ No newline at end of file diff --git a/SCMG/models/Reinvent_Scaffold_Decorator/model copy.py b/SCMG/models/Reinvent_Scaffold_Decorator/model copy.py new file mode 100644 index 0000000000000000000000000000000000000000..85ed98da342e63696371099158471e07cd1bf25c --- /dev/null +++ b/SCMG/models/Reinvent_Scaffold_Decorator/model copy.py @@ -0,0 +1,187 @@ +import math +import logging + +import torch +import torch.nn as nn +from torch.nn import functional as F + +logger = logging.getLogger(__name__) +from SCMG.config import varables + +# class ModelConfig(): +# rate_dropout_embedding = 0.1 +# rate_dropout_residue = 0.1 +# rate_dropout_attention = 0.1 +# block_size=125 +# def __init__(self, size_vocab, **kwargs): +# self.size_vocab = size_vocab +# for k,v in kwargs.items(): +# setattr(self, k, v) + +class CausalSelfAttention(nn.Module): + def __init__(self, config): + super().__init__() + assert config[varables.DIM_ATTENTION] % config[varables.NUM_HEADS] == 0 + self.key = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION]) + self.query = 
nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION]) + self.value = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION]) + self.dropout_attention = nn.Dropout(config[varables.RATE_DROPOUT]) + self.dropout_residue = nn.Dropout(config[varables.RATE_DROPOUT]) + self.projection = nn.Linear(config[varables.DIM_ATTENTION], config[varables.DIM_EMBEDDING]) + self.register_buffer("mask", torch.tril(torch.ones(config[varables.SIZE_BLOCK], config[varables.SIZE_BLOCK])) + .view(1, 1, config[varables.SIZE_BLOCK], config[varables.SIZE_BLOCK])) + self.n_head = config[varables.NUM_HEADS] + self.single_head_dim = config[varables.DIM_ATTENTION] // self.n_head + self.attention_features = config[varables.DIM_ATTENTION] + + def forward(self, x, layer_past=None): + B, T, C = x.size() + k = self.key(x).view(B, T, self.n_head,self.single_head_dim).transpose(1, 2) + q = self.query(x).view(B, T, self.n_head,self.single_head_dim).transpose(1, 2) + v = self.value(x).view(B, T, self.n_head,self.single_head_dim).transpose(1, 2) + att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1))) + att = att.masked_fill(self.mask[:,:,:T,:T] == 0, float('-inf')) + att = F.softmax(att, dim=-1) + att = self.dropout_attention(att) + y = att @ v + y = y.transpose(1, 2).contiguous().view(B, T, self.attention_features) + y = self.dropout_residue(self.projection(y)) + return y + + +class CrossAttention(nn.Module): + def __init__(self, config): + super().__init__() + assert config[varables.DIM_ATTENTION] % config[varables.NUM_HEADS] == 0 + self.key = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION]) + self.query = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION]) + self.value = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION]) + self.dropout_attention = nn.Dropout(config[varables.RATE_DROPOUT]) + self.dropout_residue = nn.Dropout(config[varables.RATE_DROPOUT]) + self.projection = 
nn.Linear(config[varables.DIM_ATTENTION], config[varables.DIM_EMBEDDING]) + self.n_head = config[varables.NUM_HEADS] + self.single_head_dim = config[varables.DIM_ATTENTION] // self.n_head + self.attention_features = config[varables.DIM_ATTENTION] + self.register_buffer("mask", torch.tril(torch.ones(config[varables.SIZE_BLOCK], config[varables.SIZE_BLOCK])) + .view(1, 1, config[varables.SIZE_BLOCK], config[varables.SIZE_BLOCK])) + + def forward(self, x_encoder,x_decoder, layer_past=None): + B_encoder, T_encoder, C_encoder = x_encoder.size() + B_decoder, T_decoder, C_decoder = x_decoder.size() + k = self.key( x_encoder).view(B_encoder, T_encoder, self.n_head,self.single_head_dim).transpose(1, 2) + q = self.query(x_decoder).view(B_encoder, T_decoder, self.n_head,self.single_head_dim).transpose(1, 2) + v = self.value(x_encoder).view(B_encoder, T_encoder, self.n_head,self.single_head_dim).transpose(1, 2) + att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1))) + att = att.masked_fill(self.mask[:,:,:T_decoder,:T_encoder] == 0, float('-inf')) + att = F.softmax(att, dim=-1) + att = self.dropout_attention(att) + y = att @ v + y = y.transpose(1, 2).contiguous().view(B_encoder, T_decoder, self.attention_features) + y = self.dropout_residue(self.projection(y)) + return y + + + + +class EncoderBlock(nn.Module): + def __init__(self, config): + super().__init__() + self.ln1 = nn.LayerNorm(config[varables.DIM_EMBEDDING]) + self.ln2 = nn.LayerNorm(config[varables.DIM_EMBEDDING]) + self.attn = CausalSelfAttention(config) + self.mlp = nn.Sequential( + nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_FEEDFORWARD]), + nn.GELU(), + nn.Linear(config[varables.DIM_FEEDFORWARD], config[varables.DIM_EMBEDDING]), + nn.Dropout(config[varables.RATE_DROPOUT]), + ) + + def forward(self, x): + # = y_input + x = x + self.attn(self.ln1(x)) + x = x + self.mlp(self.ln2(x)) + return x + +class DecoderBlock(nn.Module): + def __init__(self, config): + super().__init__() + self.ln1 = 
nn.LayerNorm(config[varables.DIM_EMBEDDING]) + self.ln2 = nn.LayerNorm(config[varables.DIM_EMBEDDING]) + self.masked_attn = CausalSelfAttention(config) + self.cross_attn = CrossAttention(config) + self.mlp = nn.Sequential( + nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_FEEDFORWARD]), + nn.GELU(), + nn.Linear(config[varables.DIM_FEEDFORWARD], config[varables.DIM_EMBEDDING]), + nn.Dropout(config[varables.RATE_DROPOUT]), + ) + + def forward(self, x_encoder,x): + # = y_input + x = x + self.masked_attn(self.ln1(x)) + x = x + self.cross_attn(x_encoder,self.ln1(x)) + x = x + self.mlp(self.ln2(x)) + return x + +class Model(nn.Module): + def __init__(self, config): + super().__init__() + self.tok_emb = nn.Embedding(config[varables.SIZE_VOCAB], config[varables.DIM_EMBEDDING]) + self.pos_emb = nn.Parameter(torch.zeros(1, config[varables.SIZE_BLOCK], config[varables.DIM_EMBEDDING])) + self.drop = nn.Dropout(config[varables.RATE_DROPOUT]) + self.encoder_blocks = nn.ModuleList([EncoderBlock(config) for _ in range(config[varables.NUM_LAYERS])]) + self.decoder_blocks = nn.ModuleList([DecoderBlock(config) for _ in range(config[varables.NUM_LAYERS])]) + # self.blocks = nn.Sequential(*[DecoderBlock(config) for _ in range(config[varables.NUM_LAYERS])]) + self.ln_f = nn.LayerNorm(config[varables.DIM_EMBEDDING]) + self.head = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.SIZE_VOCAB], bias=False) + self.block_size = config[varables.SIZE_BLOCK] + self.apply(self._init_weights) + logger.info("number of parameters: %e", sum(p.numel() for p in self.parameters())) + self.optimizer = None + + def get_block_size(self): + return self.block_size + + def _init_weights(self, module): + if isinstance(module, (nn.Linear, nn.Embedding)): + module.weight.data.normal_(mean=0.0, std=0.02) + if isinstance(module, nn.Linear) and module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.LayerNorm): + module.bias.data.zero_() + module.weight.data.fill_(1.0) 
+ def init_optimizers(self,train_config): + optimizer = torch.optim.Adam(self.parameters(), lr=train_config[varables.RATE_LEARNING]) + return optimizer + def init_scheduler(self,train_config): + scheduler = torch.optim.lr_scheduler.StepLR(self.optimizer, step_size=train_config[varables.SIZE_STEP], gamma=train_config[varables.GAMMA]) + return scheduler + def get_collate_fn(self, vocab): + def collate(results): + x_in = [a[0] for a in results] + y_in = [a[1] for a in results] + boundary = -1 + max_len_x = max([len(a) for a in x_in]) + max_len_y = max([len(a) for a in y_in]) + x = torch.tensor([(a+[vocab[varables.TOKEN_PAD]]*(max_len_x-len(a))) for a in x_in],dtype=torch.long) + y = torch.tensor([(a+[vocab[varables.TOKEN_PAD]]*(max_len_y-len(a))) for a in y_in],dtype=torch.long) + return x,y,boundary + return collate + + def forward(self, x_in, y_in, y_out=None,boundary=None): + x_in = self.drop(self.tok_emb(x_in) + self.pos_emb[:, :x_in.size()[1], :]) + y_in = self.drop(self.tok_emb(y_in) + self.pos_emb[:, :y_in.size()[1], :]) + # + for encoder_block in self.encoder_blocks: + x_in = encoder_block(x_in) + x_in = self.ln_f(x_in) + for decoder_block in self.decoder_blocks: + y_in = decoder_block(x_in,y_in) + y_in = self.ln_f(y_in) + logits = self.head(y_in) + loss = None + if y_out is not None: + loss = F.cross_entropy(logits.view(-1, logits.size(-1)), y_out.view(-1)) + return logits, loss + +# mark test \ No newline at end of file diff --git a/SCMG/models/Reinvent_Scaffold_Decorator/model.py b/SCMG/models/Reinvent_Scaffold_Decorator/model.py new file mode 100644 index 0000000000000000000000000000000000000000..2efedb5615ab35061aa476a38704b4c5826e3cfe --- /dev/null +++ b/SCMG/models/Reinvent_Scaffold_Decorator/model.py @@ -0,0 +1,276 @@ + +Skip to content + + Why GitHub? 
+ +Team +Enterprise +Explore +Marketplace +Pricing + +Sign in +Sign up +undeadpixel / +reinvent-scaffold-decorator +Public + +Code +Issues 3 +Pull requests +Actions +Projects +Wiki +Security + + Insights + +reinvent-scaffold-decorator/models/model.py / +Arús-Pous, Josep updated to revised version +Latest commit 37d0a8a on May 8, 2020 +History +0 contributors +136 lines (118 sloc) 5.75 KB +""" +Model class. +""" + +import torch +import torch.nn as tnn + +import models.decorator as mdec + + +class DecoratorModel: + + def __init__(self, vocabulary, decorator, max_sequence_length=256, no_cuda=False, mode="train"): + """ + Implements the likelihood and sampling functions of the decorator model. + :param vocabulary: A DecoratorVocabulary instance with the vocabularies of both the encoder and decoder. + :param network_params: A dict with parameters for the encoder and decoder networks. + :param decorator: An decorator network instance. + :param max_sequence_length: Maximium number of tokens allowed to sample. + :param no_cuda: Forces the model not to use CUDA, even if it is available. + :param mode: Mode in which the model should be initialized. + :return: + """ + self.vocabulary = vocabulary + self.max_sequence_length = max_sequence_length + self.network = decorator + + if torch.cuda.is_available() and not no_cuda: + self.network.cuda() + + self._nll_loss = tnn.NLLLoss(reduction="none", ignore_index=0) + self.set_mode(mode) + + @classmethod + def load_from_file(cls, path, mode="train"): + """ + Loads a model from a single file + :param path: Path to the saved model. + :param mode: Mode in which the model should be initialized. + :return: An instance of the RNN. + """ + data = torch.load(path) + + decorator = mdec.Decorator(**data["decorator"]["params"]) + decorator.load_state_dict(data["decorator"]["state"]) + + model = DecoratorModel( + decorator=decorator, + mode=mode, + **data["model"] + ) + + return model + + def save(self, path): + """ + Saves the model to a file. 
+ :param path: Path to the file which the model will be saved to. + """ + save_dict = { + 'model': { + 'vocabulary': self.vocabulary, + 'max_sequence_length': self.max_sequence_length + }, + 'decorator': { + 'params': self.network.get_params(), + 'state': self.network.state_dict() + } + } + torch.save(save_dict, path) + + def set_mode(self, mode): + """ + Changes the mode of the RNN to training or eval. + :param mode: Mode to change to (training, eval) + :return: The model instance. + """ + if mode == "sampling" or mode == "eval": + self.network.eval() + else: + self.network.train() + return self + + def likelihood(self, scaffold_seqs, scaffold_seq_lengths, decoration_seqs, decoration_seq_lengths, with_attention_weights=False): + """ + Retrieves the likelihood of a scaffold and its respective decorations. + :param scaffold_seqs: (batch, seq) A batch of padded scaffold sequences. + :param scaffold_seq_lengths: The length of the scaffold sequences (for packing purposes). + :param decoration_seqs: (batch, seq) A batch of decorator sequences. + :param decoration_seq_lengths: The length of the decorator sequences (for packing purposes). + :return: (batch) Log likelihood for each item in the batch. + """ + + # NOTE: the decoration_seq_lengths have a - 1 to prevent the end token to be forward-passed. + logits, attention_weights = self.network(scaffold_seqs, scaffold_seq_lengths, decoration_seqs, + decoration_seq_lengths - 1) # (batch, seq - 1, voc) + log_probs = logits.log_softmax(dim=2).transpose(1, 2) # (batch, voc, seq - 1) + + logits = self._nll_loss(log_probs, decoration_seqs[:, 1:]).sum(dim=1) # (batch) + if with_attention_weights: + return logits, attention_weights + else: + return logits + + @torch.no_grad() + def sample_decorations(self, scaffold_seqs, scaffold_seq_lengths): + """ + Samples as many decorations as scaffolds in the tensor. + :param scaffold_seqs: A tensor with the scaffolds to sample already encoded and padded. 
+ :param scaffold_seq_lengths: A tensor with the length of the scaffolds. + :return: An iterator with (scaffold_smi, decoration_smi, nll) triplets. + """ + batch_size = scaffold_seqs.size(0) + input_vector = torch.full( + (batch_size, 1), self.vocabulary.decoration_vocabulary["^"], dtype=torch.long).cuda() # (batch, 1) + seq_lengths = torch.ones(batch_size) # (batch) + encoder_padded_seqs, hidden_states = self.network.forward_encoder(scaffold_seqs, scaffold_seq_lengths) + nlls = torch.zeros(batch_size).cuda() + not_finished = torch.ones(batch_size, 1, dtype=torch.long).cuda() + sequences = [] + for _ in range(self.max_sequence_length - 1): + logits, hidden_states, _ = self.network.forward_decoder( + input_vector, seq_lengths, encoder_padded_seqs, hidden_states) # (batch, 1, voc) + probs = logits.softmax(dim=2).squeeze() # (batch, voc) + log_probs = logits.log_softmax(dim=2).squeeze() # (batch, voc) + input_vector = torch.multinomial(probs, 1)*not_finished # (batch, 1) + sequences.append(input_vector) + nlls += self._nll_loss(log_probs, input_vector.squeeze()) + not_finished = (input_vector > 1).type(torch.long) # 0 is padding, 1 is end token + if not_finished.sum() == 0: + break + + decoration_smiles = [self.vocabulary.decode_decoration(seq) + for seq in torch.cat(sequences, 1).data.cpu().numpy()] + scaffold_smiles = [self.vocabulary.decode_scaffold(seq) for seq in scaffold_seqs.data.cpu().numpy()] + return zip(scaffold_smiles, decoration_smiles, nlls.data.cpu().numpy().tolist()) + + + + + + + + +class Model(nn.Module): + def __init__(self, config): + super().__init__() + # Varables + self.Dim_Attention = config[varables.DIM_ATTENTION] + self.Token_Padding_Encoder = config["Token_Padding_Encoder"] + self.Token_Padding_Decoder = config["Token_Padding_Decoder"] + # Embedding and positional encoding layers + self.Embedding_Encoder = nn.Embedding(len(config["vocab_encoder"]), config[varables.DIM_ATTENTION]) + self.Embedding_Decoder = 
nn.Embedding(len(config["vocab_decoder"]), config[varables.DIM_ATTENTION]) + self.pos_emb = PositionalEncoder(config) + # Dropout and normalization layers + self.Dropout1 = nn.Dropout(config[varables.RATE_DROPOUT]) + self.Dropout2 = nn.Dropout(config[varables.RATE_DROPOUT]) + self.LayerNorm1 = nn.LayerNorm(config[varables.DIM_EMBEDDING]) + self.LayerNorm2 = nn.LayerNorm(config[varables.DIM_EMBEDDING]) + # Transformer layers + self.encoder_blocks = nn.ModuleList([EncoderBlock(config) for _ in range(config[varables.NUM_LAYERS])]) + self.decoder_blocks = nn.ModuleList([DecoderBlock(config) for _ in range(config[varables.NUM_LAYERS])]) + # Output layer + self.head = nn.Linear(config[varables.DIM_ATTENTION], len(config["vocab_decoder"]), bias=False) + # Init + self.apply(self._init_weights) + self.optimizer = None + # logger.info("number of parameters: %e", sum(p.numel() for p in self.parameters())) + + def _init_weights(self, module): + for p in module.parameters(): + if p.dim() > 1: + nn.init.xavier_uniform_(p) + # if isinstance(module, (nn.Linear, nn.Embedding)): + # module.weight.data.normal_(mean=0.0, std=0.02) + # if isinstance(module, nn.Linear) and module.bias is not None: + # module.bias.data.zero_() + # elif isinstance(module, nn.LayerNorm): + # module.bias.data.zero_() + # module.weight.data.fill_(1.0) + def init_optimizers(self,train_config): + optimizer = torch.optim.Adam(self.parameters(), lr=train_config[varables.RATE_LEARNING]) + return optimizer + def init_scheduler(self,train_config): + scheduler = torch.optim.lr_scheduler.StepLR(self.optimizer, step_size=train_config[varables.SIZE_STEP], gamma=train_config[varables.GAMMA]) + return scheduler + def get_collate_fn(self, vocab_encoder,vocab_decoder): + def collate(results): + X_Encoder = [a[0] for a in results] + X_Decoder = [a[1] for a in results] + boundary = -1 + max_len_x = max([len(a) for a in X_Encoder]) + max_len_y = max([len(a) for a in X_Decoder]) + x = 
torch.tensor([(a+[vocab_encoder[varables.TOKEN_PAD]]*(max_len_x-len(a))) for a in X_Encoder],dtype=torch.long) + y = torch.tensor([(a+[vocab_decoder[varables.TOKEN_PAD]]*(max_len_y-len(a))) for a in X_Decoder],dtype=torch.long) + return x,y,boundary + return collate + + def generate_masks(self,X_Encoder, X_Decoder): + # Generate encoder, decoder, cross masks + T = X_Decoder.shape[1] + Mask_Encoder = (X_Encoder != self.Token_Padding_Encoder).unsqueeze(-2).unsqueeze(-2) + Mask_Decoder = (X_Decoder != self.Token_Padding_Decoder).unsqueeze(-2).unsqueeze(-2).repeat(1,1,T,1) + Mask_Cross = (X_Encoder != self.Token_Padding_Encoder).unsqueeze(-2).unsqueeze(-2) + mask_tril = torch.tril(torch.ones(T, T)).view(1, 1, T, T).to(Mask_Decoder.device) + Mask_Decoder = Mask_Decoder.masked_fill(mask_tril==0,0) + return Mask_Encoder,Mask_Decoder,Mask_Cross + + def forward(self, X_Encoder, X_Decoder, Y_Decoder_Ref=None,boundary=None): + Mask_Encoder, Mask_Decoder,Mask_Cross = self.generate_masks(X_Encoder, X_Decoder) + # preprocess + X_Encoder = self.Dropout1(self.Embedding_Encoder(X_Encoder) * math.sqrt(self.Dim_Attention) + self.pos_emb(X_Encoder.size(1))) + X_Decoder = self.Dropout2(self.Embedding_Decoder(X_Decoder) * math.sqrt(self.Dim_Attention) + self.pos_emb(X_Decoder.size(1))) + #### Now X_Encoder: BatchSize, SequenceLength, DimAttention + # Encoder blocks + for encoder_block in self.encoder_blocks: + X_Encoder = encoder_block(X_Encoder,Mask_Encoder) + X_Encoder = self.LayerNorm1(X_Encoder) + # Decoder blocks + for decoder_block in self.decoder_blocks: + X_Decoder = decoder_block(X_Encoder,X_Decoder,Mask_Cross,Mask_Decoder) + X_Decoder = self.LayerNorm2(X_Decoder) + Y_Decoder_Logits = self.head(X_Decoder) + loss = None + if Y_Decoder_Ref is not None: + loss = F.cross_entropy(Y_Decoder_Logits.view(-1, Y_Decoder_Logits.size(-1)), Y_Decoder_Ref.view(-1),ignore_index=self.Token_Padding_Decoder) + return Y_Decoder_Logits, loss + + # def generate_masks(self,X_Encoder, X_Decoder): + # 
# Generate encoder, decoder, cross masks + # Mask_Encoder = (X_Encoder != self.Token_Padding_Encoder).unsqueeze(-2).int().cpu() + # Mask_Decoder = (X_Decoder != self.Token_Padding_Decoder).unsqueeze(-2).int().cpu() + # Mask_Cross = Mask_Decoder.unsqueeze(-1) @ Mask_Encoder.unsqueeze(-2) + # Mask_Encoder = Mask_Encoder.unsqueeze(-1) @ Mask_Encoder.unsqueeze(-2) + # Mask_Decoder = Mask_Decoder.unsqueeze(-1) @ Mask_Decoder.unsqueeze(-2) + # T = X_Decoder.shape[1] + # mask_tril = torch.tril(torch.ones(T, T)).view(1, 1, T, T) + # Mask_Decoder = Mask_Decoder.masked_fill(mask_tril==0,0) + # Mask_Encoder = Mask_Encoder.to(X_Encoder.device) + # Mask_Decoder = Mask_Decoder.to(X_Decoder.device) + # Mask_Cross = Mask_Cross.to(X_Encoder.device) + # return Mask_Encoder,Mask_Decoder,Mask_Cross diff --git a/SCMG/models/Reinvent_Scaffold_Decorator/sampler.py b/SCMG/models/Reinvent_Scaffold_Decorator/sampler.py new file mode 100644 index 0000000000000000000000000000000000000000..1c606302447534d425b50bbd15c153ea79895b65 --- /dev/null +++ b/SCMG/models/Reinvent_Scaffold_Decorator/sampler.py @@ -0,0 +1,85 @@ +import random +import numpy as np +import torch +import torch.nn as nn +from torch.nn import functional as F + +def set_seed(seed): + random.seed(seed) + np.random.seed(seed) + torch.manual_seed(seed) + torch.cuda.manual_seed_all(seed) + +def top_k_logits(logits, k): + v, ix = torch.topk(logits, k) + out = logits.clone() + out[out < v[:, [-1]]] = -float('Inf') + return out + +@torch.no_grad() +def sample(model, x, steps, temperature=1.0, sample=False, top_k=None): + block_size = model.get_block_size() + model.eval() + for k in range(steps): + x_cond = x if x.size(1) <= block_size else x[:, -block_size:] + logits, _ = model(x_cond) + logits = logits[:, -1, :] / temperature + if top_k is not None: + logits = top_k_logits(logits, top_k) + probs = F.softmax(logits, dim=-1) + if sample: + ix = torch.multinomial(probs, num_samples=1) + else: + _, ix = torch.topk(probs, k=1, dim=-1) + x 
= torch.cat((x, ix), dim=1) + + return x + + + + +@torch.no_grad() +def sample(model, x, steps, temperature=1.0,boundary=None): + block_size = model.get_block_size() + model.eval() + for k in range(steps): + x_cond = x if x.size(1) <= block_size else x[:, -block_size:] + logits, _ = model(x_cond,boundary=boundary) + logits = logits[:, -1, :] / temperature + probs = F.softmax(logits, dim=-1) + ix = torch.multinomial(probs, num_samples=1) + x = torch.cat((x, ix), dim=1) + return x + +'L_5*C(=O)NCc1cccc(OC)c1.*c1nsc2ccccc12COc1cccc(CNC(=O)c2cccc(NC(=O)c3nsc4ccccc34)c2)c1' + +# for i in range(1,21): +def sample_L(i,option='string'): + # i=2 + prefix = 'L_'+str(i) + string_input = prefix + '*O=C1NN=Cc2c1cccc2.*O=C(C1CC1)N1CCNCC1' + array_input = [vocab[a] for a in [''] + list(string_input)] + boundary = [len(array_input)] + tensor_input = torch.tensor(array_input,device='cuda').unsqueeze(0).repeat(32,1) + boundary = boundary*32 + tensor_output = sample(model,tensor_input,250,boundary=boundary) + strings_output = [] + for j in range(tensor_output.shape[0]): + list_string_output = [inv[a] for a in tensor_output[j,boundary[j]:].cpu().numpy() if a != vocab['']] + # if list_string_output[0] == '': + # list_string_output = list_string_output[1:] + if list_string_output[-1] == '': + list_string_output = list_string_output[:-1] + string_output = ''.join(list_string_output) + strings_output.append(string_output) + print(string_output) + for j in range(tensor_output.shape[0]): + if test_valid(strings_output[j]): + print(1) + else: + print(0) + + # logits,_ = model(tensor_input,boundary=boundary) + + +['', 'L', '_', '5', '*', 'C', '(', '=', 'O', ')', 'N', 'C', 'c', '1', 'c', 'c', 'c', 'c', '(', 'O', 'C', ')', 'c', '1', '.', '*', 'c', '1', 'n', 's', 'c', '2', 'c', 'c', 'c', 'c', 'c', '1', '2', 'C', 'O', 'c', '1', 'c', 'c', 'c', 'c', '(', 'C', 'N', 'C', '(', '=', 'O', ')', 'c', '2', 'c', 'c', 'c', 'c', '(', 'N', 'C', '(', '=', 'O', ')', 'c', '3', 'n', 's', 'c', '4', 'c', 'c', 'c', 
'c', 'c', '3', '4', ')', 'c', '2', ')', 'c', '1', ''] diff --git a/SCMG/models/Transformer/__init__.py b/SCMG/models/Transformer/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..87cb7367b08ea138065cb521a41cd746f06b57ef --- /dev/null +++ b/SCMG/models/Transformer/__init__.py @@ -0,0 +1 @@ +from .model import * diff --git a/SCMG/models/Transformer/__pycache__/__init__.cpython-310.pyc b/SCMG/models/Transformer/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9d2712f8eda4d9d04ec9a27c39f87615e7cdcd1b Binary files /dev/null and b/SCMG/models/Transformer/__pycache__/__init__.cpython-310.pyc differ diff --git a/SCMG/models/Transformer/__pycache__/model copy 2.cpython-310.pyc b/SCMG/models/Transformer/__pycache__/model copy 2.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..81652cccdc7c73478f43eb4e6156e47c3b89c8ab Binary files /dev/null and b/SCMG/models/Transformer/__pycache__/model copy 2.cpython-310.pyc differ diff --git a/SCMG/models/Transformer/__pycache__/model copy 3.cpython-310.pyc b/SCMG/models/Transformer/__pycache__/model copy 3.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0bbe759cc798a5e956d173f2439ae0d5e75e6ef7 Binary files /dev/null and b/SCMG/models/Transformer/__pycache__/model copy 3.cpython-310.pyc differ diff --git a/SCMG/models/Transformer/__pycache__/model copy.cpython-310.pyc b/SCMG/models/Transformer/__pycache__/model copy.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..7e9d890c989ba1479718238c89d8abc62083fa22 Binary files /dev/null and b/SCMG/models/Transformer/__pycache__/model copy.cpython-310.pyc differ diff --git a/SCMG/models/Transformer/__pycache__/model.cpython-310.pyc b/SCMG/models/Transformer/__pycache__/model.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ca7f600f46ec69862f287dab51151445c0d05a5b Binary 
files /dev/null and b/SCMG/models/Transformer/__pycache__/model.cpython-310.pyc differ diff --git a/SCMG/models/Transformer/__pycache__/sampler.cpython-310.pyc b/SCMG/models/Transformer/__pycache__/sampler.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2ed2bc6c3cbce2c0be07be58a51ec73edc35f213 Binary files /dev/null and b/SCMG/models/Transformer/__pycache__/sampler.cpython-310.pyc differ diff --git a/SCMG/models/Transformer/model copy 2.py b/SCMG/models/Transformer/model copy 2.py new file mode 100644 index 0000000000000000000000000000000000000000..3e2f3b32b2a6dcfeed7c7cb8a924e4ef420f9d8a --- /dev/null +++ b/SCMG/models/Transformer/model copy 2.py @@ -0,0 +1,175 @@ +import math +import logging + +import torch +import torch.nn as nn +from torch.nn import functional as F + +logger = logging.getLogger(__name__) +from SCMG.config import varables + +class CausalSelfAttention(nn.Module): + def __init__(self, config): + super().__init__() + assert config[varables.DIM_ATTENTION] % config[varables.NUM_HEADS] == 0 + self.key = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION]) + self.query = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION]) + self.value = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION]) + self.dropout_attention = nn.Dropout(config[varables.RATE_DROPOUT]) + self.dropout_residue = nn.Dropout(config[varables.RATE_DROPOUT]) + self.projection = nn.Linear(config[varables.DIM_ATTENTION], config[varables.DIM_EMBEDDING]) + self.register_buffer("mask", torch.tril(torch.ones(config[varables.SIZE_BLOCK], config[varables.SIZE_BLOCK])) + .view(1, 1, config[varables.SIZE_BLOCK], config[varables.SIZE_BLOCK])) + self.n_head = config[varables.NUM_HEADS] + self.single_head_dim = config[varables.DIM_ATTENTION] // self.n_head + self.attention_features = config[varables.DIM_ATTENTION] + + def forward(self, x, layer_past=None): + B, T, C = x.size() + k = self.key(x).view(B, 
T, self.n_head,self.single_head_dim).transpose(1, 2) + q = self.query(x).view(B, T, self.n_head,self.single_head_dim).transpose(1, 2) + v = self.value(x).view(B, T, self.n_head,self.single_head_dim).transpose(1, 2) + att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1))) + att = att.masked_fill(self.mask[:,:,:T,:T] == 0, float('-inf')) + att = F.softmax(att, dim=-1) + att = self.dropout_attention(att) + y = att @ v + y = y.transpose(1, 2).contiguous().view(B, T, self.attention_features) + y = self.dropout_residue(self.projection(y)) + return y + + +class Attention(nn.Module): + def __init__(self, config): + super().__init__() + assert config[varables.DIM_ATTENTION] % config[varables.NUM_HEADS] == 0 + self.key = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION]) + self.query = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION]) + self.value = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION]) + self.dropout_attention = nn.Dropout(config[varables.RATE_DROPOUT]) + self.dropout_residue = nn.Dropout(config[varables.RATE_DROPOUT]) + self.projection = nn.Linear(config[varables.DIM_ATTENTION], config[varables.DIM_EMBEDDING]) + self.n_head = config[varables.NUM_HEADS] + self.single_head_dim = config[varables.DIM_ATTENTION] // self.n_head + self.attention_features = config[varables.DIM_ATTENTION] + + def forward(self, x_encoder,x_decoder, mask): + B_encoder, T_encoder, C_encoder = x_encoder.size() + B_decoder, T_decoder, C_decoder = x_decoder.size() + k = self.key(x_encoder).view(B_encoder, T_encoder, self.n_head,self.single_head_dim).transpose(1, 2) + q = self.query(x_decoder).view(B_encoder, T_decoder, self.n_head,self.single_head_dim).transpose(1, 2) + v = self.value(x_encoder).view(B_encoder, T_encoder, self.n_head,self.single_head_dim).transpose(1, 2) + att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1))) + att = att.masked_fill(mask == 0, float('-inf')) + att = F.softmax(att, 
dim=-1) + att = self.dropout_attention(att) + y = att @ v + y = y.transpose(1, 2).contiguous().view(B_encoder, T_decoder, self.attention_features) + y = self.dropout_residue(self.projection(y)) + return y + + + + +class EncoderBlock(nn.Module): + def __init__(self, config): + super().__init__() + self.ln1 = nn.LayerNorm(config[varables.DIM_EMBEDDING]) + self.ln2 = nn.LayerNorm(config[varables.DIM_EMBEDDING]) + self.attn = Attention(config) + self.mlp = nn.Sequential( + nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_FEEDFORWARD]), + nn.GELU(), + nn.Linear(config[varables.DIM_FEEDFORWARD], config[varables.DIM_EMBEDDING]), + nn.Dropout(config[varables.RATE_DROPOUT]), + ) + def forward(self, x): + x = self.ln1(x + self.attn(x,x)) + x = self.ln2(x + self.mlp(x)) + return x + +class DecoderBlock(nn.Module): + def __init__(self, config): + super().__init__() + self.ln1 = nn.LayerNorm(config[varables.DIM_EMBEDDING]) + self.ln2 = nn.LayerNorm(config[varables.DIM_EMBEDDING]) + self.masked_attn = CausalSelfAttention(config) + self.cross_attn = CrossAttention(config) + self.mlp = nn.Sequential( + nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_FEEDFORWARD]), + nn.GELU(), + nn.Linear(config[varables.DIM_FEEDFORWARD], config[varables.DIM_EMBEDDING]), + nn.Dropout(config[varables.RATE_DROPOUT]), + ) + + def forward(self, x_encoder,x): + # = y_input + x = x + self.masked_attn(self.ln1(x)) + x = x + self.cross_attn(x_encoder,self.ln1(x)) + x = x + self.mlp(self.ln2(x)) + return x + + +class Model(nn.Module): + def __init__(self, config): + super().__init__() + self.tok_emb_encoder = nn.Embedding(len(config["vocab_encoder"]), config[varables.DIM_EMBEDDING]) + self.tok_emb_decoder = nn.Embedding(len(config["vocab_decoder"]), config[varables.DIM_EMBEDDING]) + self.pos_emb = nn.Parameter(torch.zeros(1, config[varables.SIZE_BLOCK], config[varables.DIM_EMBEDDING])) + self.drop = nn.Dropout(config[varables.RATE_DROPOUT]) + self.encoder_blocks = 
nn.ModuleList([EncoderBlock(config) for _ in range(config[varables.NUM_LAYERS])]) + self.decoder_blocks = nn.ModuleList([DecoderBlock(config) for _ in range(config[varables.NUM_LAYERS])]) + # self.blocks = nn.Sequential(*[DecoderBlock(config) for _ in range(config[varables.NUM_LAYERS])]) + self.ln_f = nn.LayerNorm(config[varables.DIM_EMBEDDING]) + self.head = nn.Linear(config[varables.DIM_EMBEDDING], len(config["vocab_decoder"]), bias=False) + self.block_size = config[varables.SIZE_BLOCK] + self.apply(self._init_weights) + logger.info("number of parameters: %e", sum(p.numel() for p in self.parameters())) + self.optimizer = None + + def get_block_size(self): + return self.block_size + + def _init_weights(self, module): + if isinstance(module, (nn.Linear, nn.Embedding)): + module.weight.data.normal_(mean=0.0, std=0.02) + if isinstance(module, nn.Linear) and module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.LayerNorm): + module.bias.data.zero_() + module.weight.data.fill_(1.0) + def init_optimizers(self,train_config): + optimizer = torch.optim.Adam(self.parameters(), lr=train_config[varables.RATE_LEARNING]) + return optimizer + def init_scheduler(self,train_config): + scheduler = torch.optim.lr_scheduler.StepLR(self.optimizer, step_size=train_config[varables.SIZE_STEP], gamma=train_config[varables.GAMMA]) + return scheduler + def get_collate_fn(self, vocab_encoder,vocab_decoder): + def collate(results): + x_in = [a[0] for a in results] + y_in = [a[1] for a in results] + boundary = -1 + max_len_x = max([len(a) for a in x_in]) + max_len_y = max([len(a) for a in y_in]) + x = torch.tensor([(a+[vocab_encoder[varables.TOKEN_PAD]]*(max_len_x-len(a))) for a in x_in],dtype=torch.long) + y = torch.tensor([(a+[vocab_decoder[varables.TOKEN_PAD]]*(max_len_y-len(a))) for a in y_in],dtype=torch.long) + return x,y,boundary + return collate + + def forward(self, x_in, y_in, y_out=None,boundary=None): + x_in = self.drop(self.tok_emb_encoder(x_in) + 
self.pos_emb[:, :x_in.size()[1], :]) + y_in = self.drop(self.tok_emb_decoder(y_in) + self.pos_emb[:, :y_in.size()[1], :]) + # + for encoder_block in self.encoder_blocks: + x_in = encoder_block(x_in) + x_in = self.ln_f(x_in) + for decoder_block in self.decoder_blocks: + y_in = decoder_block(x_in,y_in) + y_in = self.ln_f(y_in) + logits = self.head(y_in) + loss = None + if y_out is not None: + loss = F.cross_entropy(logits.view(-1, logits.size(-1)), y_out.view(-1)) + return logits, loss + +# mark test \ No newline at end of file diff --git a/SCMG/models/Transformer/model copy 3.py b/SCMG/models/Transformer/model copy 3.py new file mode 100644 index 0000000000000000000000000000000000000000..97c7c39ce3f736996efe25e2f470e6dfd4b76748 --- /dev/null +++ b/SCMG/models/Transformer/model copy 3.py @@ -0,0 +1,179 @@ +import math +import logging + +import torch +import torch.nn as nn +from torch.nn import functional as F + +logger = logging.getLogger(__name__) +from SCMG.config import varables + +class CausalSelfAttention(nn.Module): + def __init__(self, config): + super().__init__() + assert config[varables.DIM_ATTENTION] % config[varables.NUM_HEADS] == 0 + self.key = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION]) + self.query = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION]) + self.value = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION]) + self.dropout_attention = nn.Dropout(config[varables.RATE_DROPOUT]) + self.dropout_residue = nn.Dropout(config[varables.RATE_DROPOUT]) + self.projection = nn.Linear(config[varables.DIM_ATTENTION], config[varables.DIM_EMBEDDING]) + self.register_buffer("mask", torch.tril(torch.ones(config[varables.SIZE_BLOCK], config[varables.SIZE_BLOCK])) + .view(1, 1, config[varables.SIZE_BLOCK], config[varables.SIZE_BLOCK])) + self.n_head = config[varables.NUM_HEADS] + self.single_head_dim = config[varables.DIM_ATTENTION] // self.n_head + self.attention_features = 
config[varables.DIM_ATTENTION] + + def forward(self, x, layer_past=None): + B, T, C = x.size() + k = self.key(x).view(B, T, self.n_head,self.single_head_dim).transpose(1, 2) + q = self.query(x).view(B, T, self.n_head,self.single_head_dim).transpose(1, 2) + v = self.value(x).view(B, T, self.n_head,self.single_head_dim).transpose(1, 2) + att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1))) + att = att.masked_fill(self.mask[:,:,:T,:T] == 0, float('-inf')) + att = F.softmax(att, dim=-1) + att = self.dropout_attention(att) + y = att @ v + y = y.transpose(1, 2).contiguous().view(B, T, self.attention_features) + y = self.dropout_residue(self.projection(y)) + return y + + +class CrossAttention(nn.Module): + def __init__(self, config): + super().__init__() + assert config[varables.DIM_ATTENTION] % config[varables.NUM_HEADS] == 0 + self.key = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION]) + self.query = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION]) + self.value = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION]) + self.dropout_attention = nn.Dropout(config[varables.RATE_DROPOUT]) + self.dropout_residue = nn.Dropout(config[varables.RATE_DROPOUT]) + self.projection = nn.Linear(config[varables.DIM_ATTENTION], config[varables.DIM_EMBEDDING]) + self.n_head = config[varables.NUM_HEADS] + self.single_head_dim = config[varables.DIM_ATTENTION] // self.n_head + self.attention_features = config[varables.DIM_ATTENTION] + self.register_buffer("mask", torch.tril(torch.ones(config[varables.SIZE_BLOCK], config[varables.SIZE_BLOCK])) + .view(1, 1, config[varables.SIZE_BLOCK], config[varables.SIZE_BLOCK])) + + def forward(self, x_encoder,x_decoder, layer_past=None): + B_encoder, T_encoder, C_encoder = x_encoder.size() + B_decoder, T_decoder, C_decoder = x_decoder.size() + k = self.key( x_encoder).view(B_encoder, T_encoder, self.n_head,self.single_head_dim).transpose(1, 2) + q = 
self.query(x_decoder).view(B_encoder, T_decoder, self.n_head,self.single_head_dim).transpose(1, 2) + v = self.value(x_encoder).view(B_encoder, T_encoder, self.n_head,self.single_head_dim).transpose(1, 2) + att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1))) + att = att.masked_fill(self.mask[:,:,:T_decoder,:T_encoder] == 0, float('-inf')) + att = F.softmax(att, dim=-1) + att = self.dropout_attention(att) + y = att @ v + y = y.transpose(1, 2).contiguous().view(B_encoder, T_decoder, self.attention_features) + y = self.dropout_residue(self.projection(y)) + return y + + + + +class EncoderBlock(nn.Module): + def __init__(self, config): + super().__init__() + self.ln1 = nn.LayerNorm(config[varables.DIM_EMBEDDING]) + self.ln2 = nn.LayerNorm(config[varables.DIM_EMBEDDING]) + self.attn = CausalSelfAttention(config) + self.mlp = nn.Sequential( + nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_FEEDFORWARD]), + nn.GELU(), + nn.Linear(config[varables.DIM_FEEDFORWARD], config[varables.DIM_EMBEDDING]), + nn.Dropout(config[varables.RATE_DROPOUT]), + ) + + def forward(self, x): + # = y_input + x = x + self.attn(self.ln1(x)) + x = x + self.mlp(self.ln2(x)) + return x + +class DecoderBlock(nn.Module): + def __init__(self, config): + super().__init__() + self.ln1 = nn.LayerNorm(config[varables.DIM_EMBEDDING]) + self.ln2 = nn.LayerNorm(config[varables.DIM_EMBEDDING]) + self.ln3 = nn.LayerNorm(config[varables.DIM_EMBEDDING]) + self.masked_attn = CausalSelfAttention(config) + self.cross_attn = CrossAttention(config) + self.mlp = nn.Sequential( + nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_FEEDFORWARD]), + nn.GELU(), + nn.Linear(config[varables.DIM_FEEDFORWARD], config[varables.DIM_EMBEDDING]), + nn.Dropout(config[varables.RATE_DROPOUT]), + ) + + def forward(self, x_encoder,x): + # = y_input + x = x + self.masked_attn(self.ln1(x)) + x = x + self.cross_attn(x_encoder,self.ln1(x)) + x = x + self.mlp(self.ln3(x)) + return x + +class 
Model(nn.Module): + def __init__(self, config): + super().__init__() + self.tok_emb_encoder = nn.Embedding(len(config["vocab_encoder"]), config[varables.DIM_EMBEDDING]) + self.tok_emb_decoder = nn.Embedding(len(config["vocab_decoder"]), config[varables.DIM_EMBEDDING]) + self.pos_emb = nn.Parameter(torch.zeros(1, config[varables.SIZE_BLOCK], config[varables.DIM_EMBEDDING])) + self.drop = nn.Dropout(config[varables.RATE_DROPOUT]) + self.encoder_blocks = nn.ModuleList([EncoderBlock(config) for _ in range(config[varables.NUM_LAYERS])]) + self.decoder_blocks = nn.ModuleList([DecoderBlock(config) for _ in range(config[varables.NUM_LAYERS])]) + # self.blocks = nn.Sequential(*[DecoderBlock(config) for _ in range(config[varables.NUM_LAYERS])]) + self.ln_f = nn.LayerNorm(config[varables.DIM_EMBEDDING]) + self.head = nn.Linear(config[varables.DIM_EMBEDDING], len(config["vocab_decoder"]), bias=False) + self.block_size = config[varables.SIZE_BLOCK] + self.apply(self._init_weights) + logger.info("number of parameters: %e", sum(p.numel() for p in self.parameters())) + self.optimizer = None + + def get_block_size(self): + return self.block_size + + def _init_weights(self, module): + if isinstance(module, (nn.Linear, nn.Embedding)): + module.weight.data.normal_(mean=0.0, std=0.02) + if isinstance(module, nn.Linear) and module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.LayerNorm): + module.bias.data.zero_() + module.weight.data.fill_(1.0) + def init_optimizers(self,train_config): + optimizer = torch.optim.Adam(self.parameters(), lr=train_config[varables.RATE_LEARNING]) + return optimizer + def init_scheduler(self,train_config): + scheduler = torch.optim.lr_scheduler.StepLR(self.optimizer, step_size=train_config[varables.SIZE_STEP], gamma=train_config[varables.GAMMA]) + return scheduler + def get_collate_fn(self, vocab_encoder,vocab_decoder): + def collate(results): + x_in = [a[0] for a in results] + y_in = [a[1] for a in results] + boundary = -1 + 
max_len_x = max([len(a) for a in x_in]) + max_len_y = max([len(a) for a in y_in]) + x = torch.tensor([(a+[vocab_encoder[varables.TOKEN_PAD]]*(max_len_x-len(a))) for a in x_in],dtype=torch.long) + y = torch.tensor([(a+[vocab_decoder[varables.TOKEN_PAD]]*(max_len_y-len(a))) for a in y_in],dtype=torch.long) + return x,y,boundary + return collate + + def forward(self, x_in, y_in, y_out=None,boundary=None): + x_in = self.drop(self.tok_emb_encoder(x_in) + self.pos_emb[:, :x_in.size()[1], :]) + y_in = self.drop(self.tok_emb_decoder(y_in) + self.pos_emb[:, :y_in.size()[1], :]) + # + for encoder_block in self.encoder_blocks: + x_in = encoder_block(x_in) + x_in = self.ln_f(x_in) + for decoder_block in self.decoder_blocks: + y_in = decoder_block(x_in,y_in) + y_in = self.ln_f(y_in) + logits = self.head(y_in) + loss = None + if y_out is not None: + loss = F.cross_entropy(logits.view(-1, logits.size(-1)), y_out.view(-1)) + return logits, loss + +# mark test \ No newline at end of file diff --git a/SCMG/models/Transformer/model copy.py b/SCMG/models/Transformer/model copy.py new file mode 100644 index 0000000000000000000000000000000000000000..e054937f34f6e76e5e04715e6e5c76e5b486ec3f --- /dev/null +++ b/SCMG/models/Transformer/model copy.py @@ -0,0 +1,187 @@ +import math +import logging + +import torch +import torch.nn as nn +from torch.nn import functional as F + +logger = logging.getLogger(__name__) +from SCMG.config import varables + +# class ModelConfig(): +# rate_dropout_embedding = 0.1 +# rate_dropout_residue = 0.1 +# rate_dropout_attention = 0.1 +# block_size=125 +# def __init__(self, size_vocab, **kwargs): +# self.size_vocab = size_vocab +# for k,v in kwargs.items(): +# setattr(self, k, v) + +class CausalSelfAttention(nn.Module): + def __init__(self, config): + super().__init__() + assert config[varables.DIM_ATTENTION] % config[varables.NUM_HEADS] == 0 + self.key = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION]) + self.query = 
nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION]) + self.value = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION]) + self.dropout_attention = nn.Dropout(config[varables.RATE_DROPOUT]) + self.dropout_residue = nn.Dropout(config[varables.RATE_DROPOUT]) + self.projection = nn.Linear(config[varables.DIM_ATTENTION], config[varables.DIM_EMBEDDING]) + self.register_buffer("mask", torch.tril(torch.ones(config[varables.SIZE_BLOCK], config[varables.SIZE_BLOCK])) + .view(1, 1, config[varables.SIZE_BLOCK], config[varables.SIZE_BLOCK])) + self.n_head = config[varables.NUM_HEADS] + self.single_head_dim = config[varables.DIM_ATTENTION] // self.n_head + self.attention_features = config[varables.DIM_ATTENTION] + + def forward(self, x, layer_past=None): + B, T, C = x.size() + k = self.key(x).view(B, T, self.n_head,self.single_head_dim).transpose(1, 2) + q = self.query(x).view(B, T, self.n_head,self.single_head_dim).transpose(1, 2) + v = self.value(x).view(B, T, self.n_head,self.single_head_dim).transpose(1, 2) + att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1))) + att = att.masked_fill(self.mask[:,:,:T,:T] == 0, float('-inf')) + att = F.softmax(att, dim=-1) + att = self.dropout_attention(att) + y = att @ v + y = y.transpose(1, 2).contiguous().view(B, T, self.attention_features) + y = self.dropout_residue(self.projection(y)) + return y + + +class CrossAttention(nn.Module): + def __init__(self, config): + super().__init__() + assert config[varables.DIM_ATTENTION] % config[varables.NUM_HEADS] == 0 + self.key = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION]) + self.query = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION]) + self.value = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION]) + self.dropout_attention = nn.Dropout(config[varables.RATE_DROPOUT]) + self.dropout_residue = nn.Dropout(config[varables.RATE_DROPOUT]) + self.projection = 
nn.Linear(config[varables.DIM_ATTENTION], config[varables.DIM_EMBEDDING]) + self.n_head = config[varables.NUM_HEADS] + self.single_head_dim = config[varables.DIM_ATTENTION] // self.n_head + self.attention_features = config[varables.DIM_ATTENTION] + self.register_buffer("mask", torch.tril(torch.ones(config[varables.SIZE_BLOCK], config[varables.SIZE_BLOCK])) + .view(1, 1, config[varables.SIZE_BLOCK], config[varables.SIZE_BLOCK])) + + def forward(self, x_encoder,x_decoder, layer_past=None): + B_encoder, T_encoder, C_encoder = x_encoder.size() + B_decoder, T_decoder, C_decoder = x_decoder.size() + k = self.key( x_encoder).view(B_encoder, T_encoder, self.n_head,self.single_head_dim).transpose(1, 2) + q = self.query(x_decoder).view(B_encoder, T_decoder, self.n_head,self.single_head_dim).transpose(1, 2) + v = self.value(x_encoder).view(B_encoder, T_encoder, self.n_head,self.single_head_dim).transpose(1, 2) + att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1))) + att = att.masked_fill(self.mask[:,:,:T_decoder,:T_encoder] == 0, float('-inf')) + att = F.softmax(att, dim=-1) + att = self.dropout_attention(att) + y = att @ v + y = y.transpose(1, 2).contiguous().view(B_encoder, T_decoder, self.attention_features) + y = self.dropout_residue(self.projection(y)) + return y + + + + +class EncoderBlock(nn.Module): + def __init__(self, config): + super().__init__() + self.ln1 = nn.LayerNorm(config[varables.DIM_EMBEDDING]) + self.ln2 = nn.LayerNorm(config[varables.DIM_EMBEDDING]) + self.attn = CausalSelfAttention(config) + self.mlp = nn.Sequential( + nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_FEEDFORWARD]), + nn.GELU(), + nn.Linear(config[varables.DIM_FEEDFORWARD], config[varables.DIM_EMBEDDING]), + nn.Dropout(config[varables.RATE_DROPOUT]), + ) + + def forward(self, x): + # = y_input + x = x + self.attn(self.ln1(x)) + x = x + self.mlp(self.ln2(x)) + return x + +class DecoderBlock(nn.Module): + def __init__(self, config): + super().__init__() + self.ln1 = 
nn.LayerNorm(config[varables.DIM_EMBEDDING]) + self.ln2 = nn.LayerNorm(config[varables.DIM_EMBEDDING]) + self.masked_attn = CausalSelfAttention(config) + self.cross_attn = CrossAttention(config) + self.mlp = nn.Sequential( + nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_FEEDFORWARD]), + nn.GELU(), + nn.Linear(config[varables.DIM_FEEDFORWARD], config[varables.DIM_EMBEDDING]), + nn.Dropout(config[varables.RATE_DROPOUT]), + ) + + def forward(self, x_encoder,x): + # = y_input + x = x + self.masked_attn(self.ln1(x)) + x = x + self.cross_attn(x_encoder,self.ln1(x)) + x = x + self.mlp(self.ln2(x)) + return x + +class Model(nn.Module): + def __init__(self, config): + super().__init__() + self.tok_emb = nn.Embedding(len(config["vocab_encoder"]), config[varables.DIM_EMBEDDING]) + self.pos_emb = nn.Parameter(torch.zeros(1, config[varables.SIZE_BLOCK], config[varables.DIM_EMBEDDING])) + self.drop = nn.Dropout(config[varables.RATE_DROPOUT]) + self.encoder_blocks = nn.ModuleList([EncoderBlock(config) for _ in range(config[varables.NUM_LAYERS])]) + self.decoder_blocks = nn.ModuleList([DecoderBlock(config) for _ in range(config[varables.NUM_LAYERS])]) + # self.blocks = nn.Sequential(*[DecoderBlock(config) for _ in range(config[varables.NUM_LAYERS])]) + self.ln_f = nn.LayerNorm(config[varables.DIM_EMBEDDING]) + self.head = nn.Linear(config[varables.DIM_EMBEDDING], len(config["vocab_encoder"]), bias=False) + self.block_size = config[varables.SIZE_BLOCK] + self.apply(self._init_weights) + logger.info("number of parameters: %e", sum(p.numel() for p in self.parameters())) + self.optimizer = None + + def get_block_size(self): + return self.block_size + + def _init_weights(self, module): + if isinstance(module, (nn.Linear, nn.Embedding)): + module.weight.data.normal_(mean=0.0, std=0.02) + if isinstance(module, nn.Linear) and module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.LayerNorm): + module.bias.data.zero_() + 
module.weight.data.fill_(1.0) + def init_optimizers(self,train_config): + optimizer = torch.optim.Adam(self.parameters(), lr=train_config[varables.RATE_LEARNING]) + return optimizer + def init_scheduler(self,train_config): + scheduler = torch.optim.lr_scheduler.StepLR(self.optimizer, step_size=train_config[varables.SIZE_STEP], gamma=train_config[varables.GAMMA]) + return scheduler + def get_collate_fn(self, vocab): + def collate(results): + x_in = [a[0] for a in results] + y_in = [a[1] for a in results] + boundary = -1 + max_len_x = max([len(a) for a in x_in]) + max_len_y = max([len(a) for a in y_in]) + x = torch.tensor([(a+[vocab[varables.TOKEN_PAD]]*(max_len_x-len(a))) for a in x_in],dtype=torch.long) + y = torch.tensor([(a+[vocab[varables.TOKEN_PAD]]*(max_len_y-len(a))) for a in y_in],dtype=torch.long) + return x,y,boundary + return collate + + def forward(self, x_in, y_in, y_out=None,boundary=None): + x_in = self.drop(self.tok_emb(x_in) + self.pos_emb[:, :x_in.size()[1], :]) + y_in = self.drop(self.tok_emb(y_in) + self.pos_emb[:, :y_in.size()[1], :]) + # + for encoder_block in self.encoder_blocks: + x_in = encoder_block(x_in) + x_in = self.ln_f(x_in) + for decoder_block in self.decoder_blocks: + y_in = decoder_block(x_in,y_in) + y_in = self.ln_f(y_in) + logits = self.head(y_in) + loss = None + if y_out is not None: + loss = F.cross_entropy(logits.view(-1, logits.size(-1)), y_out.view(-1)) + return logits, loss + +# mark test \ No newline at end of file diff --git a/SCMG/models/Transformer/model.py b/SCMG/models/Transformer/model.py new file mode 100644 index 0000000000000000000000000000000000000000..e054937f34f6e76e5e04715e6e5c76e5b486ec3f --- /dev/null +++ b/SCMG/models/Transformer/model.py @@ -0,0 +1,187 @@ +import math +import logging + +import torch +import torch.nn as nn +from torch.nn import functional as F + +logger = logging.getLogger(__name__) +from SCMG.config import varables + +# class ModelConfig(): +# rate_dropout_embedding = 0.1 +# 
rate_dropout_residue = 0.1 +# rate_dropout_attention = 0.1 +# block_size=125 +# def __init__(self, size_vocab, **kwargs): +# self.size_vocab = size_vocab +# for k,v in kwargs.items(): +# setattr(self, k, v) + +class CausalSelfAttention(nn.Module): + def __init__(self, config): + super().__init__() + assert config[varables.DIM_ATTENTION] % config[varables.NUM_HEADS] == 0 + self.key = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION]) + self.query = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION]) + self.value = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION]) + self.dropout_attention = nn.Dropout(config[varables.RATE_DROPOUT]) + self.dropout_residue = nn.Dropout(config[varables.RATE_DROPOUT]) + self.projection = nn.Linear(config[varables.DIM_ATTENTION], config[varables.DIM_EMBEDDING]) + self.register_buffer("mask", torch.tril(torch.ones(config[varables.SIZE_BLOCK], config[varables.SIZE_BLOCK])) + .view(1, 1, config[varables.SIZE_BLOCK], config[varables.SIZE_BLOCK])) + self.n_head = config[varables.NUM_HEADS] + self.single_head_dim = config[varables.DIM_ATTENTION] // self.n_head + self.attention_features = config[varables.DIM_ATTENTION] + + def forward(self, x, layer_past=None): + B, T, C = x.size() + k = self.key(x).view(B, T, self.n_head,self.single_head_dim).transpose(1, 2) + q = self.query(x).view(B, T, self.n_head,self.single_head_dim).transpose(1, 2) + v = self.value(x).view(B, T, self.n_head,self.single_head_dim).transpose(1, 2) + att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1))) + att = att.masked_fill(self.mask[:,:,:T,:T] == 0, float('-inf')) + att = F.softmax(att, dim=-1) + att = self.dropout_attention(att) + y = att @ v + y = y.transpose(1, 2).contiguous().view(B, T, self.attention_features) + y = self.dropout_residue(self.projection(y)) + return y + + +class CrossAttention(nn.Module): + def __init__(self, config): + super().__init__() + assert 
config[varables.DIM_ATTENTION] % config[varables.NUM_HEADS] == 0 + self.key = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION]) + self.query = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION]) + self.value = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION]) + self.dropout_attention = nn.Dropout(config[varables.RATE_DROPOUT]) + self.dropout_residue = nn.Dropout(config[varables.RATE_DROPOUT]) + self.projection = nn.Linear(config[varables.DIM_ATTENTION], config[varables.DIM_EMBEDDING]) + self.n_head = config[varables.NUM_HEADS] + self.single_head_dim = config[varables.DIM_ATTENTION] // self.n_head + self.attention_features = config[varables.DIM_ATTENTION] + self.register_buffer("mask", torch.tril(torch.ones(config[varables.SIZE_BLOCK], config[varables.SIZE_BLOCK])) + .view(1, 1, config[varables.SIZE_BLOCK], config[varables.SIZE_BLOCK])) + + def forward(self, x_encoder,x_decoder, layer_past=None): + B_encoder, T_encoder, C_encoder = x_encoder.size() + B_decoder, T_decoder, C_decoder = x_decoder.size() + k = self.key( x_encoder).view(B_encoder, T_encoder, self.n_head,self.single_head_dim).transpose(1, 2) + q = self.query(x_decoder).view(B_encoder, T_decoder, self.n_head,self.single_head_dim).transpose(1, 2) + v = self.value(x_encoder).view(B_encoder, T_encoder, self.n_head,self.single_head_dim).transpose(1, 2) + att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1))) + att = att.masked_fill(self.mask[:,:,:T_decoder,:T_encoder] == 0, float('-inf')) + att = F.softmax(att, dim=-1) + att = self.dropout_attention(att) + y = att @ v + y = y.transpose(1, 2).contiguous().view(B_encoder, T_decoder, self.attention_features) + y = self.dropout_residue(self.projection(y)) + return y + + + + +class EncoderBlock(nn.Module): + def __init__(self, config): + super().__init__() + self.ln1 = nn.LayerNorm(config[varables.DIM_EMBEDDING]) + self.ln2 = nn.LayerNorm(config[varables.DIM_EMBEDDING]) + self.attn = 
CausalSelfAttention(config) + self.mlp = nn.Sequential( + nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_FEEDFORWARD]), + nn.GELU(), + nn.Linear(config[varables.DIM_FEEDFORWARD], config[varables.DIM_EMBEDDING]), + nn.Dropout(config[varables.RATE_DROPOUT]), + ) + + def forward(self, x): + # = y_input + x = x + self.attn(self.ln1(x)) + x = x + self.mlp(self.ln2(x)) + return x + +class DecoderBlock(nn.Module): + def __init__(self, config): + super().__init__() + self.ln1 = nn.LayerNorm(config[varables.DIM_EMBEDDING]) + self.ln2 = nn.LayerNorm(config[varables.DIM_EMBEDDING]) + self.masked_attn = CausalSelfAttention(config) + self.cross_attn = CrossAttention(config) + self.mlp = nn.Sequential( + nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_FEEDFORWARD]), + nn.GELU(), + nn.Linear(config[varables.DIM_FEEDFORWARD], config[varables.DIM_EMBEDDING]), + nn.Dropout(config[varables.RATE_DROPOUT]), + ) + + def forward(self, x_encoder,x): + # = y_input + x = x + self.masked_attn(self.ln1(x)) + x = x + self.cross_attn(x_encoder,self.ln1(x)) + x = x + self.mlp(self.ln2(x)) + return x + +class Model(nn.Module): + def __init__(self, config): + super().__init__() + self.tok_emb = nn.Embedding(len(config["vocab_encoder"]), config[varables.DIM_EMBEDDING]) + self.pos_emb = nn.Parameter(torch.zeros(1, config[varables.SIZE_BLOCK], config[varables.DIM_EMBEDDING])) + self.drop = nn.Dropout(config[varables.RATE_DROPOUT]) + self.encoder_blocks = nn.ModuleList([EncoderBlock(config) for _ in range(config[varables.NUM_LAYERS])]) + self.decoder_blocks = nn.ModuleList([DecoderBlock(config) for _ in range(config[varables.NUM_LAYERS])]) + # self.blocks = nn.Sequential(*[DecoderBlock(config) for _ in range(config[varables.NUM_LAYERS])]) + self.ln_f = nn.LayerNorm(config[varables.DIM_EMBEDDING]) + self.head = nn.Linear(config[varables.DIM_EMBEDDING], len(config["vocab_encoder"]), bias=False) + self.block_size = config[varables.SIZE_BLOCK] + self.apply(self._init_weights) 
+ logger.info("number of parameters: %e", sum(p.numel() for p in self.parameters())) + self.optimizer = None + + def get_block_size(self): + return self.block_size + + def _init_weights(self, module): + if isinstance(module, (nn.Linear, nn.Embedding)): + module.weight.data.normal_(mean=0.0, std=0.02) + if isinstance(module, nn.Linear) and module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.LayerNorm): + module.bias.data.zero_() + module.weight.data.fill_(1.0) + def init_optimizers(self,train_config): + optimizer = torch.optim.Adam(self.parameters(), lr=train_config[varables.RATE_LEARNING]) + return optimizer + def init_scheduler(self,train_config): + scheduler = torch.optim.lr_scheduler.StepLR(self.optimizer, step_size=train_config[varables.SIZE_STEP], gamma=train_config[varables.GAMMA]) + return scheduler + def get_collate_fn(self, vocab): + def collate(results): + x_in = [a[0] for a in results] + y_in = [a[1] for a in results] + boundary = -1 + max_len_x = max([len(a) for a in x_in]) + max_len_y = max([len(a) for a in y_in]) + x = torch.tensor([(a+[vocab[varables.TOKEN_PAD]]*(max_len_x-len(a))) for a in x_in],dtype=torch.long) + y = torch.tensor([(a+[vocab[varables.TOKEN_PAD]]*(max_len_y-len(a))) for a in y_in],dtype=torch.long) + return x,y,boundary + return collate + + def forward(self, x_in, y_in, y_out=None,boundary=None): + x_in = self.drop(self.tok_emb(x_in) + self.pos_emb[:, :x_in.size()[1], :]) + y_in = self.drop(self.tok_emb(y_in) + self.pos_emb[:, :y_in.size()[1], :]) + # + for encoder_block in self.encoder_blocks: + x_in = encoder_block(x_in) + x_in = self.ln_f(x_in) + for decoder_block in self.decoder_blocks: + y_in = decoder_block(x_in,y_in) + y_in = self.ln_f(y_in) + logits = self.head(y_in) + loss = None + if y_out is not None: + loss = F.cross_entropy(logits.view(-1, logits.size(-1)), y_out.view(-1)) + return logits, loss + +# mark test \ No newline at end of file diff --git a/SCMG/models/Transformer/sampler.py 
b/SCMG/models/Transformer/sampler.py new file mode 100644 index 0000000000000000000000000000000000000000..1c606302447534d425b50bbd15c153ea79895b65 --- /dev/null +++ b/SCMG/models/Transformer/sampler.py @@ -0,0 +1,85 @@ +import random +import numpy as np +import torch +import torch.nn as nn +from torch.nn import functional as F + +def set_seed(seed): + random.seed(seed) + np.random.seed(seed) + torch.manual_seed(seed) + torch.cuda.manual_seed_all(seed) + +def top_k_logits(logits, k): + v, ix = torch.topk(logits, k) + out = logits.clone() + out[out < v[:, [-1]]] = -float('Inf') + return out + +@torch.no_grad() +def sample(model, x, steps, temperature=1.0, sample=False, top_k=None): + block_size = model.get_block_size() + model.eval() + for k in range(steps): + x_cond = x if x.size(1) <= block_size else x[:, -block_size:] + logits, _ = model(x_cond) + logits = logits[:, -1, :] / temperature + if top_k is not None: + logits = top_k_logits(logits, top_k) + probs = F.softmax(logits, dim=-1) + if sample: + ix = torch.multinomial(probs, num_samples=1) + else: + _, ix = torch.topk(probs, k=1, dim=-1) + x = torch.cat((x, ix), dim=1) + + return x + + + + +@torch.no_grad() +def sample(model, x, steps, temperature=1.0,boundary=None): + block_size = model.get_block_size() + model.eval() + for k in range(steps): + x_cond = x if x.size(1) <= block_size else x[:, -block_size:] + logits, _ = model(x_cond,boundary=boundary) + logits = logits[:, -1, :] / temperature + probs = F.softmax(logits, dim=-1) + ix = torch.multinomial(probs, num_samples=1) + x = torch.cat((x, ix), dim=1) + return x + +'L_5*C(=O)NCc1cccc(OC)c1.*c1nsc2ccccc12COc1cccc(CNC(=O)c2cccc(NC(=O)c3nsc4ccccc34)c2)c1' + +# for i in range(1,21): +def sample_L(i,option='string'): + # i=2 + prefix = 'L_'+str(i) + string_input = prefix + '*O=C1NN=Cc2c1cccc2.*O=C(C1CC1)N1CCNCC1' + array_input = [vocab[a] for a in [''] + list(string_input)] + boundary = [len(array_input)] + tensor_input = 
torch.tensor(array_input,device='cuda').unsqueeze(0).repeat(32,1) + boundary = boundary*32 + tensor_output = sample(model,tensor_input,250,boundary=boundary) + strings_output = [] + for j in range(tensor_output.shape[0]): + list_string_output = [inv[a] for a in tensor_output[j,boundary[j]:].cpu().numpy() if a != vocab['']] + # if list_string_output[0] == '': + # list_string_output = list_string_output[1:] + if list_string_output[-1] == '': + list_string_output = list_string_output[:-1] + string_output = ''.join(list_string_output) + strings_output.append(string_output) + print(string_output) + for j in range(tensor_output.shape[0]): + if test_valid(strings_output[j]): + print(1) + else: + print(0) + + # logits,_ = model(tensor_input,boundary=boundary) + + +['', 'L', '_', '5', '*', 'C', '(', '=', 'O', ')', 'N', 'C', 'c', '1', 'c', 'c', 'c', 'c', '(', 'O', 'C', ')', 'c', '1', '.', '*', 'c', '1', 'n', 's', 'c', '2', 'c', 'c', 'c', 'c', 'c', '1', '2', 'C', 'O', 'c', '1', 'c', 'c', 'c', 'c', '(', 'C', 'N', 'C', '(', '=', 'O', ')', 'c', '2', 'c', 'c', 'c', 'c', '(', 'N', 'C', '(', '=', 'O', ')', 'c', '3', 'n', 's', 'c', '4', 'c', 'c', 'c', 'c', 'c', '3', '4', ')', 'c', '2', ')', 'c', '1', ''] diff --git a/SCMG/models/Transformer_Test/__init__.py b/SCMG/models/Transformer_Test/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/SCMG/models/Transformer_Test/__pycache__/__init__.cpython-310.pyc b/SCMG/models/Transformer_Test/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b8db1dde5092caca6bf576f805bae30e4aada64f Binary files /dev/null and b/SCMG/models/Transformer_Test/__pycache__/__init__.cpython-310.pyc differ diff --git a/SCMG/models/Transformer_Test/__pycache__/model.cpython-310.pyc b/SCMG/models/Transformer_Test/__pycache__/model.cpython-310.pyc new file mode 100644 index 
0000000000000000000000000000000000000000..e70ddb28cec728f2fb9abfd23c38420855f31920 Binary files /dev/null and b/SCMG/models/Transformer_Test/__pycache__/model.cpython-310.pyc differ diff --git a/SCMG/models/Transformer_Test/__pycache__/sampler.cpython-310.pyc b/SCMG/models/Transformer_Test/__pycache__/sampler.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..10af8d4ec5cd0aaac36d5fb759eb33c5c5de1f45 Binary files /dev/null and b/SCMG/models/Transformer_Test/__pycache__/sampler.cpython-310.pyc differ diff --git a/SCMG/models/Transformer_Test/model.py b/SCMG/models/Transformer_Test/model.py new file mode 100644 index 0000000000000000000000000000000000000000..c38354e9b334af027d3507ea8c11f48a76fcd207 --- /dev/null +++ b/SCMG/models/Transformer_Test/model.py @@ -0,0 +1,187 @@ +import math +import logging + +import torch +import torch.nn as nn +from torch.nn import functional as F + +logger = logging.getLogger(__name__) +from SCMG.config import varables + +# class ModelConfig(): +# rate_dropout_embedding = 0.1 +# rate_dropout_residue = 0.1 +# rate_dropout_attention = 0.1 +# block_size=125 +# def __init__(self, size_vocab, **kwargs): +# self.size_vocab = size_vocab +# for k,v in kwargs.items(): +# setattr(self, k, v) + +class CausalSelfAttention(nn.Module): + def __init__(self, config): + super().__init__() + assert config[varables.DIM_ATTENTION] % config[varables.NUM_HEADS] == 0 + self.key = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION]) + self.query = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION]) + self.value = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION]) + self.dropout_attention = nn.Dropout(config[varables.RATE_DROPOUT]) + self.dropout_residue = nn.Dropout(config[varables.RATE_DROPOUT]) + self.projection = nn.Linear(config[varables.DIM_ATTENTION], config[varables.DIM_EMBEDDING]) + self.register_buffer("mask", 
torch.tril(torch.ones(config[varables.SIZE_BLOCK], config[varables.SIZE_BLOCK])) + .view(1, 1, config[varables.SIZE_BLOCK], config[varables.SIZE_BLOCK])) + self.n_head = config[varables.NUM_HEADS] + self.single_head_dim = config[varables.DIM_ATTENTION] // self.n_head + self.attention_features = config[varables.DIM_ATTENTION] + + def forward(self, x, layer_past=None): + B, T, C = x.size() + k = self.key(x).view(B, T, self.n_head,self.single_head_dim).transpose(1, 2) + q = self.query(x).view(B, T, self.n_head,self.single_head_dim).transpose(1, 2) + v = self.value(x).view(B, T, self.n_head,self.single_head_dim).transpose(1, 2) + att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1))) + att = att.masked_fill(self.mask[:,:,:T,:T] == 0, float('-inf')) + att = F.softmax(att, dim=-1) + att = self.dropout_attention(att) + y = att @ v + y = y.transpose(1, 2).contiguous().view(B, T, self.attention_features) + y = self.dropout_residue(self.projection(y)) + return y + + +class CrossAttention(nn.Module): + def __init__(self, config): + super().__init__() + assert config[varables.DIM_ATTENTION] % config[varables.NUM_HEADS] == 0 + self.key = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION]) + self.query = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION]) + self.value = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION]) + self.dropout_attention = nn.Dropout(config[varables.RATE_DROPOUT]) + self.dropout_residue = nn.Dropout(config[varables.RATE_DROPOUT]) + self.projection = nn.Linear(config[varables.DIM_ATTENTION], config[varables.DIM_EMBEDDING]) + self.n_head = config[varables.NUM_HEADS] + self.single_head_dim = config[varables.DIM_ATTENTION] // self.n_head + self.attention_features = config[varables.DIM_ATTENTION] + self.register_buffer("mask", torch.tril(torch.ones(config[varables.SIZE_BLOCK], config[varables.SIZE_BLOCK])) + .view(1, 1, config[varables.SIZE_BLOCK], config[varables.SIZE_BLOCK])) + + def 
forward(self, x_encoder,x_decoder, layer_past=None): + B_encoder, T_encoder, C_encoder = x_encoder.size() + B_decoder, T_decoder, C_decoder = x_decoder.size() + k = self.key(torch.cat([x_encoder,x_decoder],dim=1)).view(B_encoder, (T_encoder+T_decoder), self.n_head,self.single_head_dim).transpose(1, 2) + q = self.query(x_decoder).view(B_encoder, T_decoder, self.n_head,self.single_head_dim).transpose(1, 2) + v = self.value(torch.cat([x_encoder,x_decoder],dim=1)).view(B_decoder, (T_encoder+T_decoder), self.n_head,self.single_head_dim).transpose(1, 2) + att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1))) + att = att.masked_fill(self.mask[:,:,:T_decoder,:(T_encoder+T_decoder)] == 0, float('-inf')) + att = F.softmax(att, dim=-1) + att = self.dropout_attention(att) + y = att @ v + y = y.transpose(1, 2).contiguous().view(B_encoder, T_decoder, self.attention_features) + y = self.dropout_residue(self.projection(y)) + return y + + + + +class EncoderBlock(nn.Module): + def __init__(self, config): + super().__init__() + self.ln1 = nn.LayerNorm(config[varables.DIM_EMBEDDING]) + self.ln2 = nn.LayerNorm(config[varables.DIM_EMBEDDING]) + self.attn = CausalSelfAttention(config) + self.mlp = nn.Sequential( + nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_FEEDFORWARD]), + nn.GELU(), + nn.Linear(config[varables.DIM_FEEDFORWARD], config[varables.DIM_EMBEDDING]), + nn.Dropout(config[varables.RATE_DROPOUT]), + ) + + def forward(self, x): + # = y_input + x = x + self.attn(self.ln1(x)) + x = x + self.mlp(self.ln2(x)) + return x + +class DecoderBlock(nn.Module): + def __init__(self, config): + super().__init__() + self.ln1 = nn.LayerNorm(config[varables.DIM_EMBEDDING]) + self.ln2 = nn.LayerNorm(config[varables.DIM_EMBEDDING]) + self.masked_attn = CausalSelfAttention(config) + self.cross_attn = CrossAttention(config) + self.mlp = nn.Sequential( + nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_FEEDFORWARD]), + nn.GELU(), + 
nn.Linear(config[varables.DIM_FEEDFORWARD], config[varables.DIM_EMBEDDING]), + nn.Dropout(config[varables.RATE_DROPOUT]), + ) + + def forward(self, x_encoder,x): + # = y_input + x = x + self.masked_attn(self.ln1(x)) + x = x + self.cross_attn(x_encoder,self.ln1(x)) + x = x + self.mlp(self.ln2(x)) + return x + +class Model(nn.Module): + def __init__(self, config): + super().__init__() + self.tok_emb = nn.Embedding(config[varables.SIZE_VOCAB], config[varables.DIM_EMBEDDING]) + self.pos_emb = nn.Parameter(torch.zeros(1, config[varables.SIZE_BLOCK], config[varables.DIM_EMBEDDING])) + self.drop = nn.Dropout(config[varables.RATE_DROPOUT]) + self.encoder_blocks = nn.ModuleList([EncoderBlock(config) for _ in range(config[varables.NUM_LAYERS])]) + self.decoder_blocks = nn.ModuleList([DecoderBlock(config) for _ in range(config[varables.NUM_LAYERS])]) + # self.blocks = nn.Sequential(*[DecoderBlock(config) for _ in range(config[varables.NUM_LAYERS])]) + self.ln_f = nn.LayerNorm(config[varables.DIM_EMBEDDING]) + self.head = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.SIZE_VOCAB], bias=False) + self.block_size = config[varables.SIZE_BLOCK] + self.apply(self._init_weights) + logger.info("number of parameters: %e", sum(p.numel() for p in self.parameters())) + self.optimizer = None + + def get_block_size(self): + return self.block_size + + def _init_weights(self, module): + if isinstance(module, (nn.Linear, nn.Embedding)): + module.weight.data.normal_(mean=0.0, std=0.02) + if isinstance(module, nn.Linear) and module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.LayerNorm): + module.bias.data.zero_() + module.weight.data.fill_(1.0) + def init_optimizers(self,train_config): + optimizer = torch.optim.Adam(self.parameters(), lr=train_config[varables.RATE_LEARNING]) + return optimizer + def init_scheduler(self,train_config): + scheduler = torch.optim.lr_scheduler.StepLR(self.optimizer, step_size=train_config[varables.SIZE_STEP], 
gamma=train_config[varables.GAMMA]) + return scheduler + def get_collate_fn(self, vocab): + def collate(results): + x_in = [a[0] for a in results] + y_in = [a[1] for a in results] + boundary = -1 + max_len_x = max([len(a) for a in x_in]) + max_len_y = max([len(a) for a in y_in]) + x = torch.tensor([(a+[vocab[varables.TOKEN_PAD]]*(max_len_x-len(a))) for a in x_in],dtype=torch.long) + y = torch.tensor([(a+[vocab[varables.TOKEN_PAD]]*(max_len_y-len(a))) for a in y_in],dtype=torch.long) + return x,y,boundary + return collate + + def forward(self, x_in, y_in, y_out=None,boundary=None): + x_in = self.drop(self.tok_emb(x_in) + self.pos_emb[:, :x_in.size()[1], :]) + y_in = self.drop(self.tok_emb(y_in) + self.pos_emb[:, :y_in.size()[1], :]) + # + for encoder_block in self.encoder_blocks: + x_in = encoder_block(x_in) + x_in = self.ln_f(x_in) + for decoder_block in self.decoder_blocks: + y_in = decoder_block(x_in,y_in) + y_in = self.ln_f(y_in) + logits = self.head(y_in) + loss = None + if y_out is not None: + loss = F.cross_entropy(logits.view(-1, logits.size(-1)), y_out.view(-1)) + return logits, loss + +# mark test \ No newline at end of file diff --git a/SCMG/models/Transformer_Test/sampler.py b/SCMG/models/Transformer_Test/sampler.py new file mode 100644 index 0000000000000000000000000000000000000000..1c606302447534d425b50bbd15c153ea79895b65 --- /dev/null +++ b/SCMG/models/Transformer_Test/sampler.py @@ -0,0 +1,85 @@ +import random +import numpy as np +import torch +import torch.nn as nn +from torch.nn import functional as F + +def set_seed(seed): + random.seed(seed) + np.random.seed(seed) + torch.manual_seed(seed) + torch.cuda.manual_seed_all(seed) + +def top_k_logits(logits, k): + v, ix = torch.topk(logits, k) + out = logits.clone() + out[out < v[:, [-1]]] = -float('Inf') + return out + +@torch.no_grad() +def sample(model, x, steps, temperature=1.0, sample=False, top_k=None): + block_size = model.get_block_size() + model.eval() + for k in range(steps): + x_cond = x if 
x.size(1) <= block_size else x[:, -block_size:] + logits, _ = model(x_cond) + logits = logits[:, -1, :] / temperature + if top_k is not None: + logits = top_k_logits(logits, top_k) + probs = F.softmax(logits, dim=-1) + if sample: + ix = torch.multinomial(probs, num_samples=1) + else: + _, ix = torch.topk(probs, k=1, dim=-1) + x = torch.cat((x, ix), dim=1) + + return x + + + + +@torch.no_grad() +def sample(model, x, steps, temperature=1.0,boundary=None): + block_size = model.get_block_size() + model.eval() + for k in range(steps): + x_cond = x if x.size(1) <= block_size else x[:, -block_size:] + logits, _ = model(x_cond,boundary=boundary) + logits = logits[:, -1, :] / temperature + probs = F.softmax(logits, dim=-1) + ix = torch.multinomial(probs, num_samples=1) + x = torch.cat((x, ix), dim=1) + return x + +'L_5*C(=O)NCc1cccc(OC)c1.*c1nsc2ccccc12COc1cccc(CNC(=O)c2cccc(NC(=O)c3nsc4ccccc34)c2)c1' + +# for i in range(1,21): +def sample_L(i,option='string'): + # i=2 + prefix = 'L_'+str(i) + string_input = prefix + '*O=C1NN=Cc2c1cccc2.*O=C(C1CC1)N1CCNCC1' + array_input = [vocab[a] for a in [''] + list(string_input)] + boundary = [len(array_input)] + tensor_input = torch.tensor(array_input,device='cuda').unsqueeze(0).repeat(32,1) + boundary = boundary*32 + tensor_output = sample(model,tensor_input,250,boundary=boundary) + strings_output = [] + for j in range(tensor_output.shape[0]): + list_string_output = [inv[a] for a in tensor_output[j,boundary[j]:].cpu().numpy() if a != vocab['']] + # if list_string_output[0] == '': + # list_string_output = list_string_output[1:] + if list_string_output[-1] == '': + list_string_output = list_string_output[:-1] + string_output = ''.join(list_string_output) + strings_output.append(string_output) + print(string_output) + for j in range(tensor_output.shape[0]): + if test_valid(strings_output[j]): + print(1) + else: + print(0) + + # logits,_ = model(tensor_input,boundary=boundary) + + +['', 'L', '_', '5', '*', 'C', '(', '=', 'O', ')', 'N', 
'C', 'c', '1', 'c', 'c', 'c', 'c', '(', 'O', 'C', ')', 'c', '1', '.', '*', 'c', '1', 'n', 's', 'c', '2', 'c', 'c', 'c', 'c', 'c', '1', '2', 'C', 'O', 'c', '1', 'c', 'c', 'c', 'c', '(', 'C', 'N', 'C', '(', '=', 'O', ')', 'c', '2', 'c', 'c', 'c', 'c', '(', 'N', 'C', '(', '=', 'O', ')', 'c', '3', 'n', 's', 'c', '4', 'c', 'c', 'c', 'c', 'c', '3', '4', ')', 'c', '2', ')', 'c', '1', ''] diff --git a/SCMG/models/Transformer_Torch/__init__.py b/SCMG/models/Transformer_Torch/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/SCMG/models/Transformer_Torch/__pycache__/__init__.cpython-310.pyc b/SCMG/models/Transformer_Torch/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b22844cf91a2d3edccd43a41f6314018ea8c6dfe Binary files /dev/null and b/SCMG/models/Transformer_Torch/__pycache__/__init__.cpython-310.pyc differ diff --git a/SCMG/models/Transformer_Torch/__pycache__/model.cpython-310.pyc b/SCMG/models/Transformer_Torch/__pycache__/model.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2611e38b5ac471c7eb0304b2ba3d82087c838e53 Binary files /dev/null and b/SCMG/models/Transformer_Torch/__pycache__/model.cpython-310.pyc differ diff --git a/SCMG/models/Transformer_Torch/__pycache__/sampler.cpython-310.pyc b/SCMG/models/Transformer_Torch/__pycache__/sampler.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f0490881eacb0a6ff919790ecbda9c0439dbdd85 Binary files /dev/null and b/SCMG/models/Transformer_Torch/__pycache__/sampler.cpython-310.pyc differ diff --git a/SCMG/models/Transformer_Torch/model.py b/SCMG/models/Transformer_Torch/model.py new file mode 100644 index 0000000000000000000000000000000000000000..5141298f1d2c558f1c37def377daaea42c2d551e --- /dev/null +++ b/SCMG/models/Transformer_Torch/model.py @@ -0,0 +1,895 @@ +import math +import logging + +import torch +import 
torch.nn as nn +from torch.nn import functional as F +# from torch.nn import TransformerEncoder, TransformerEncoderLayer +# from torch.nn import TransformerDecoder, TransformerDecoderLayer +# from torch.nn import Transformer +logger = logging.getLogger(__name__) +from SCMG.config import varables +from torch.nn.init import constant_, xavier_normal_, xavier_uniform_ + +import numbers +import copy +from typing import Optional, Any, Union, Callable,Tuple,List + +import torch +from torch import Tensor +from torch.nn import functional as F +from torch.nn import Module +from torch.nn.modules.activation import MultiheadAttention +from torch.nn.modules.container import ModuleList +from torch.nn.modules.dropout import Dropout +from torch.nn.modules.linear import Linear +from torch.nn.modules.normalization import LayerNorm +from torch.nn.parameter import Parameter +from torch import Tensor, Size +from torch.nn import init + + +_shape_t = Union[int, List[int], Size] +class LayerNorm(Module): + r"""Applies Layer Normalization over a mini-batch of inputs as described in + the paper `Layer Normalization `__ + .. math:: + y = \frac{x - \mathrm{E}[x]}{ \sqrt{\mathrm{Var}[x] + \epsilon}} * \gamma + \beta + The mean and standard-deviation are calculated over the last `D` dimensions, where `D` + is the dimension of :attr:`normalized_shape`. For example, if :attr:`normalized_shape` + is ``(3, 5)`` (a 2-dimensional shape), the mean and standard-deviation are computed over + the last 2 dimensions of the input (i.e. ``input.mean((-2, -1))``). + :math:`\gamma` and :math:`\beta` are learnable affine transform parameters of + :attr:`normalized_shape` if :attr:`elementwise_affine` is ``True``. + The standard-deviation is calculated via the biased estimator, equivalent to + `torch.var(input, unbiased=False)`. + .. 
note:: + Unlike Batch Normalization and Instance Normalization, which applies + scalar scale and bias for each entire channel/plane with the + :attr:`affine` option, Layer Normalization applies per-element scale and + bias with :attr:`elementwise_affine`. + This layer uses statistics computed from input data in both training and + evaluation modes. + Args: + normalized_shape (int or list or torch.Size): input shape from an expected input + of size + .. math:: + [* \times \text{normalized\_shape}[0] \times \text{normalized\_shape}[1] + \times \ldots \times \text{normalized\_shape}[-1]] + If a single integer is used, it is treated as a singleton list, and this module will + normalize over the last dimension which is expected to be of that specific size. + eps: a value added to the denominator for numerical stability. Default: 1e-5 + elementwise_affine: a boolean value that when set to ``True``, this module + has learnable per-element affine parameters initialized to ones (for weights) + and zeros (for biases). Default: ``True``. + Attributes: + weight: the learnable weights of the module of shape + :math:`\text{normalized\_shape}` when :attr:`elementwise_affine` is set to ``True``. + The values are initialized to 1. + bias: the learnable bias of the module of shape + :math:`\text{normalized\_shape}` when :attr:`elementwise_affine` is set to ``True``. + The values are initialized to 0. + Shape: + - Input: :math:`(N, *)` + - Output: :math:`(N, *)` (same shape as input) + Examples:: + >>> # NLP Example + >>> batch, sentence_length, embedding_dim = 20, 5, 10 + >>> embedding = torch.randn(batch, sentence_length, embedding_dim) + >>> layer_norm = nn.LayerNorm(embedding_dim) + >>> # Activate module + >>> layer_norm(embedding) + >>> + >>> # Image Example + >>> N, C, H, W = 20, 5, 10, 10 + >>> input = torch.randn(N, C, H, W) + >>> # Normalize over the last three dimensions (i.e. 
the channel and spatial dimensions) + >>> # as shown in the image below + >>> layer_norm = nn.LayerNorm([C, H, W]) + >>> output = layer_norm(input) + .. image:: ../_static/img/nn/layer_norm.jpg + :scale: 50 % + """ + __constants__ = ['normalized_shape', 'eps', 'elementwise_affine'] + normalized_shape: Tuple[int, ...] + eps: float + elementwise_affine: bool + + def __init__(self, normalized_shape: _shape_t, eps: float = 1e-5, elementwise_affine: bool = True, + device=None, dtype=None) -> None: + factory_kwargs = {'device': device, 'dtype': dtype} + super(LayerNorm, self).__init__() + if isinstance(normalized_shape, numbers.Integral): + # mypy error: incompatible types in assignment + normalized_shape = (normalized_shape,) # type: ignore[assignment] + self.normalized_shape = tuple(normalized_shape) # type: ignore[arg-type] + self.eps = eps + self.elementwise_affine = elementwise_affine + if self.elementwise_affine: + self.weight = Parameter(torch.empty(self.normalized_shape, **factory_kwargs)) + self.bias = Parameter(torch.empty(self.normalized_shape, **factory_kwargs)) + else: + self.register_parameter('weight', None) + self.register_parameter('bias', None) + + self.reset_parameters() + + def reset_parameters(self) -> None: + if self.elementwise_affine: + init.ones_(self.weight) + init.zeros_(self.bias) + + def forward(self, input: Tensor) -> Tensor: + return F.layer_norm( + input, self.normalized_shape, self.weight, self.bias, self.eps) + + def extra_repr(self) -> str: + return '{normalized_shape}, eps={eps}, ' \ + 'elementwise_affine={elementwise_affine}'.format(**self.__dict__) + + +class Linear(Module): + r"""Applies a linear transformation to the incoming data: :math:`y = xA^T + b` + This module supports :ref:`TensorFloat32`. + Args: + in_features: size of each input sample + out_features: size of each output sample + bias: If set to ``False``, the layer will not learn an additive bias. 
+ Default: ``True`` + Shape: + - Input: :math:`(*, H_{in})` where :math:`*` means any number of + dimensions including none and :math:`H_{in} = \text{in\_features}`. + - Output: :math:`(*, H_{out})` where all but the last dimension + are the same shape as the input and :math:`H_{out} = \text{out\_features}`. + Attributes: + weight: the learnable weights of the module of shape + :math:`(\text{out\_features}, \text{in\_features})`. The values are + initialized from :math:`\mathcal{U}(-\sqrt{k}, \sqrt{k})`, where + :math:`k = \frac{1}{\text{in\_features}}` + bias: the learnable bias of the module of shape :math:`(\text{out\_features})`. + If :attr:`bias` is ``True``, the values are initialized from + :math:`\mathcal{U}(-\sqrt{k}, \sqrt{k})` where + :math:`k = \frac{1}{\text{in\_features}}` + Examples:: + >>> m = nn.Linear(20, 30) + >>> input = torch.randn(128, 20) + >>> output = m(input) + >>> print(output.size()) + torch.Size([128, 30]) + """ + __constants__ = ['in_features', 'out_features'] + in_features: int + out_features: int + weight: Tensor + + def __init__(self, in_features: int, out_features: int, bias: bool = True, + device=None, dtype=None) -> None: + factory_kwargs = {'device': device, 'dtype': dtype} + super(Linear, self).__init__() + self.in_features = in_features + self.out_features = out_features + self.weight = Parameter(torch.empty((out_features, in_features), **factory_kwargs)) + if bias: + self.bias = Parameter(torch.empty(out_features, **factory_kwargs)) + else: + self.register_parameter('bias', None) + self.reset_parameters() + + def reset_parameters(self) -> None: + # Setting a=sqrt(5) in kaiming_uniform is the same as initializing with + # uniform(-1/sqrt(in_features), 1/sqrt(in_features)). 
For details, see + # https://github.com/pytorch/pytorch/issues/57109 + init.kaiming_uniform_(self.weight, a=math.sqrt(5)) + if self.bias is not None: + fan_in, _ = init._calculate_fan_in_and_fan_out(self.weight) + bound = 1 / math.sqrt(fan_in) if fan_in > 0 else 0 + init.uniform_(self.bias, -bound, bound) + + def forward(self, input: Tensor) -> Tensor: + return F.linear(input, self.weight, self.bias) + + def extra_repr(self) -> str: + return 'in_features={}, out_features={}, bias={}'.format( + self.in_features, self.out_features, self.bias is not None + ) + +# This class exists solely to avoid triggering an obscure error when scripting +# an improperly quantized attention layer. See this issue for details: +# https://github.com/pytorch/pytorch/issues/58969 +# TODO: fail fast on quantization API usage error, then remove this class +# and replace uses of it with plain Linear +class NonDynamicallyQuantizableLinear(Linear): + def __init__(self, in_features: int, out_features: int, bias: bool = True, + device=None, dtype=None) -> None: + super().__init__(in_features, out_features, bias=bias, + device=device, dtype=dtype) + + +class MultiheadAttention(Module): + r"""Allows the model to jointly attend to information + from different representation subspaces. + See `Attention Is All You Need `_. + .. math:: + \text{MultiHead}(Q, K, V) = \text{Concat}(head_1,\dots,head_h)W^O + where :math:`head_i = \text{Attention}(QW_i^Q, KW_i^K, VW_i^V)`. + Args: + embed_dim: Total dimension of the model. + num_heads: Number of parallel attention heads. Note that ``embed_dim`` will be split + across ``num_heads`` (i.e. each head will have dimension ``embed_dim // num_heads``). + dropout: Dropout probability on ``attn_output_weights``. Default: ``0.0`` (no dropout). + bias: If specified, adds bias to input / output projection layers. Default: ``True``. + add_bias_kv: If specified, adds bias to the key and value sequences at dim=0. Default: ``False``. 
+ add_zero_attn: If specified, adds a new batch of zeros to the key and value sequences at dim=1. + Default: ``False``. + kdim: Total number of features for keys. Default: ``None`` (uses ``kdim=embed_dim``). + vdim: Total number of features for values. Default: ``None`` (uses ``vdim=embed_dim``). + batch_first: If ``True``, then the input and output tensors are provided + as (batch, seq, feature). Default: ``False`` (seq, batch, feature). + Examples:: + >>> multihead_attn = nn.MultiheadAttention(embed_dim, num_heads) + >>> attn_output, attn_output_weights = multihead_attn(query, key, value) + """ + __constants__ = ['batch_first'] + bias_k: Optional[torch.Tensor] + bias_v: Optional[torch.Tensor] + + def __init__(self, embed_dim, num_heads, dropout=0., bias=True, add_bias_kv=False, add_zero_attn=False, + kdim=None, vdim=None, batch_first=False, device=None, dtype=None) -> None: + factory_kwargs = {'device': device, 'dtype': dtype} + super(MultiheadAttention, self).__init__() + self.embed_dim = embed_dim + self.kdim = kdim if kdim is not None else embed_dim + self.vdim = vdim if vdim is not None else embed_dim + self._qkv_same_embed_dim = self.kdim == embed_dim and self.vdim == embed_dim + + self.num_heads = num_heads + self.dropout = dropout + self.batch_first = batch_first + self.head_dim = embed_dim // num_heads + assert self.head_dim * num_heads == self.embed_dim, "embed_dim must be divisible by num_heads" + + if self._qkv_same_embed_dim is False: + self.q_proj_weight = Parameter(torch.empty((embed_dim, embed_dim), **factory_kwargs)) + self.k_proj_weight = Parameter(torch.empty((embed_dim, self.kdim), **factory_kwargs)) + self.v_proj_weight = Parameter(torch.empty((embed_dim, self.vdim), **factory_kwargs)) + self.register_parameter('in_proj_weight', None) + else: + self.in_proj_weight = Parameter(torch.empty((3 * embed_dim, embed_dim), **factory_kwargs)) + self.register_parameter('q_proj_weight', None) + self.register_parameter('k_proj_weight', None) + 
self.register_parameter('v_proj_weight', None) + + if bias: + self.in_proj_bias = Parameter(torch.empty(3 * embed_dim, **factory_kwargs)) + else: + self.register_parameter('in_proj_bias', None) + self.out_proj = NonDynamicallyQuantizableLinear(embed_dim, embed_dim, bias=bias, **factory_kwargs) + + if add_bias_kv: + self.bias_k = Parameter(torch.empty((1, 1, embed_dim), **factory_kwargs)) + self.bias_v = Parameter(torch.empty((1, 1, embed_dim), **factory_kwargs)) + else: + self.bias_k = self.bias_v = None + + self.add_zero_attn = add_zero_attn + + self._reset_parameters() + + def _reset_parameters(self): + if self._qkv_same_embed_dim: + xavier_uniform_(self.in_proj_weight) + else: + xavier_uniform_(self.q_proj_weight) + xavier_uniform_(self.k_proj_weight) + xavier_uniform_(self.v_proj_weight) + + if self.in_proj_bias is not None: + constant_(self.in_proj_bias, 0.) + constant_(self.out_proj.bias, 0.) + if self.bias_k is not None: + xavier_normal_(self.bias_k) + if self.bias_v is not None: + xavier_normal_(self.bias_v) + + def __setstate__(self, state): + # Support loading old MultiheadAttention checkpoints generated by v1.1.0 + if '_qkv_same_embed_dim' not in state: + state['_qkv_same_embed_dim'] = True + + super(MultiheadAttention, self).__setstate__(state) + + def forward(self, query: Tensor, key: Tensor, value: Tensor, key_padding_mask: Optional[Tensor] = None, + need_weights: bool = True, attn_mask: Optional[Tensor] = None) -> Tuple[Tensor, Optional[Tensor]]: + r""" + Args: + query: Query embeddings of shape :math:`(L, N, E_q)` when ``batch_first=False`` or :math:`(N, L, E_q)` + when ``batch_first=True``, where :math:`L` is the target sequence length, :math:`N` is the batch size, + and :math:`E_q` is the query embedding dimension ``embed_dim``. Queries are compared against + key-value pairs to produce the output. See "Attention Is All You Need" for more details. 
+ key: Key embeddings of shape :math:`(S, N, E_k)` when ``batch_first=False`` or :math:`(N, S, E_k)` when + ``batch_first=True``, where :math:`S` is the source sequence length, :math:`N` is the batch size, and + :math:`E_k` is the key embedding dimension ``kdim``. See "Attention Is All You Need" for more details. + value: Value embeddings of shape :math:`(S, N, E_v)` when ``batch_first=False`` or :math:`(N, S, E_v)` when + ``batch_first=True``, where :math:`S` is the source sequence length, :math:`N` is the batch size, and + :math:`E_v` is the value embedding dimension ``vdim``. See "Attention Is All You Need" for more details. + key_padding_mask: If specified, a mask of shape :math:`(N, S)` indicating which elements within ``key`` + to ignore for the purpose of attention (i.e. treat as "padding"). Binary and byte masks are supported. + For a binary mask, a ``True`` value indicates that the corresponding ``key`` value will be ignored for + the purpose of attention. For a byte mask, a non-zero value indicates that the corresponding ``key`` + value will be ignored. + need_weights: If specified, returns ``attn_output_weights`` in addition to ``attn_outputs``. + Default: ``True``. + attn_mask: If specified, a 2D or 3D mask preventing attention to certain positions. Must be of shape + :math:`(L, S)` or :math:`(N\cdot\text{num\_heads}, L, S)`, where :math:`N` is the batch size, + :math:`L` is the target sequence length, and :math:`S` is the source sequence length. A 2D mask will be + broadcasted across the batch while a 3D mask allows for a different mask for each entry in the batch. + Binary, byte, and float masks are supported. For a binary mask, a ``True`` value indicates that the + corresponding position is not allowed to attend. For a byte mask, a non-zero value indicates that the + corresponding position is not allowed to attend. For a float mask, the mask values will be added to + the attention weight. 
+ Outputs: + - **attn_output** - Attention outputs of shape :math:`(L, N, E)` when ``batch_first=False`` or + :math:`(N, L, E)` when ``batch_first=True``, where :math:`L` is the target sequence length, :math:`N` is + the batch size, and :math:`E` is the embedding dimension ``embed_dim``. + - **attn_output_weights** - Attention output weights of shape :math:`(N, L, S)`, where :math:`N` is the batch + size, :math:`L` is the target sequence length, and :math:`S` is the source sequence length. Only returned + when ``need_weights=True``. + """ + if self.batch_first: + query, key, value = [x.transpose(1, 0) for x in (query, key, value)] + + if not self._qkv_same_embed_dim: + attn_output, attn_output_weights = F.multi_head_attention_forward( + query, key, value, self.embed_dim, self.num_heads, + self.in_proj_weight, self.in_proj_bias, + self.bias_k, self.bias_v, self.add_zero_attn, + self.dropout, self.out_proj.weight, self.out_proj.bias, + training=self.training, + key_padding_mask=key_padding_mask, need_weights=need_weights, + attn_mask=attn_mask, use_separate_proj_weight=True, + q_proj_weight=self.q_proj_weight, k_proj_weight=self.k_proj_weight, + v_proj_weight=self.v_proj_weight) + else: + attn_output, attn_output_weights = F.multi_head_attention_forward( + query, key, value, self.embed_dim, self.num_heads, + self.in_proj_weight, self.in_proj_bias, + self.bias_k, self.bias_v, self.add_zero_attn, + self.dropout, self.out_proj.weight, self.out_proj.bias, + training=self.training, + key_padding_mask=key_padding_mask, need_weights=need_weights, + attn_mask=attn_mask) + if self.batch_first: + return attn_output.transpose(1, 0), attn_output_weights + else: + return attn_output, attn_output_weights +class Transformer(Module): + r"""A transformer model. User is able to modify the attributes as needed. The architecture + is based on the paper "Attention Is All You Need". 
Ashish Vaswani, Noam Shazeer, + Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N Gomez, Lukasz Kaiser, and + Illia Polosukhin. 2017. Attention is all you need. In Advances in Neural Information + Processing Systems, pages 6000-6010. Users can build the BERT(https://arxiv.org/abs/1810.04805) + model with corresponding parameters. + Args: + d_model: the number of expected features in the encoder/decoder inputs (default=512). + nhead: the number of heads in the multiheadattention models (default=8). + num_encoder_layers: the number of sub-encoder-layers in the encoder (default=6). + num_decoder_layers: the number of sub-decoder-layers in the decoder (default=6). + dim_feedforward: the dimension of the feedforward network model (default=2048). + dropout: the dropout value (default=0.1). + activation: the activation function of encoder/decoder intermediate layer, can be a string + ("relu" or "gelu") or a unary callable. Default: relu + custom_encoder: custom encoder (default=None). + custom_decoder: custom decoder (default=None). + layer_norm_eps: the eps value in layer normalization components (default=1e-5). + batch_first: If ``True``, then the input and output tensors are provided + as (batch, seq, feature). Default: ``False`` (seq, batch, feature). + norm_first: if ``True``, encoder and decoder layers will perform LayerNorms before + other attention and feedforward operations, otherwise after. Default: ``False`` (after). 
+ Examples:: + >>> transformer_model = nn.Transformer(nhead=16, num_encoder_layers=12) + >>> src = torch.rand((10, 32, 512)) + >>> tgt = torch.rand((20, 32, 512)) + >>> out = transformer_model(src, tgt) + Note: A full example to apply nn.Transformer module for the word language model is available in + https://github.com/pytorch/examples/tree/master/word_language_model + """ + + def __init__(self, d_model: int = 512, nhead: int = 8, num_encoder_layers: int = 6, + num_decoder_layers: int = 6, dim_feedforward: int = 2048, dropout: float = 0.1, + activation: Union[str, Callable[[Tensor], Tensor]] = F.relu, + custom_encoder: Optional[Any] = None, custom_decoder: Optional[Any] = None, + layer_norm_eps: float = 1e-5, batch_first: bool = False, norm_first: bool = False, + device=None, dtype=None) -> None: + factory_kwargs = {'device': device, 'dtype': dtype} + super(Transformer, self).__init__() + + if custom_encoder is not None: + self.encoder = custom_encoder + else: + encoder_layer = TransformerEncoderLayer(d_model, nhead, dim_feedforward, dropout, + activation, layer_norm_eps, batch_first, norm_first, + **factory_kwargs) + encoder_norm = LayerNorm(d_model, eps=layer_norm_eps, **factory_kwargs) + self.encoder = TransformerEncoder(encoder_layer, num_encoder_layers, encoder_norm) + + if custom_decoder is not None: + self.decoder = custom_decoder + else: + decoder_layer = TransformerDecoderLayer(d_model, nhead, dim_feedforward, dropout, + activation, layer_norm_eps, batch_first, norm_first, + **factory_kwargs) + decoder_norm = LayerNorm(d_model, eps=layer_norm_eps, **factory_kwargs) + self.decoder = TransformerDecoder(decoder_layer, num_decoder_layers, decoder_norm) + + self._reset_parameters() + + self.d_model = d_model + self.nhead = nhead + + self.batch_first = batch_first + + def forward(self, src: Tensor, tgt: Tensor, src_mask: Optional[Tensor] = None, tgt_mask: Optional[Tensor] = None, + memory_mask: Optional[Tensor] = None, src_key_padding_mask: Optional[Tensor] = 
None, + tgt_key_padding_mask: Optional[Tensor] = None, memory_key_padding_mask: Optional[Tensor] = None) -> Tensor: + r"""Take in and process masked source/target sequences. + Args: + src: the sequence to the encoder (required). + tgt: the sequence to the decoder (required). + src_mask: the additive mask for the src sequence (optional). + tgt_mask: the additive mask for the tgt sequence (optional). + memory_mask: the additive mask for the encoder output (optional). + src_key_padding_mask: the ByteTensor mask for src keys per batch (optional). + tgt_key_padding_mask: the ByteTensor mask for tgt keys per batch (optional). + memory_key_padding_mask: the ByteTensor mask for memory keys per batch (optional). + Shape: + - src: :math:`(S, N, E)`, `(N, S, E)` if batch_first. + - tgt: :math:`(T, N, E)`, `(N, T, E)` if batch_first. + - src_mask: :math:`(S, S)`. + - tgt_mask: :math:`(T, T)`. + - memory_mask: :math:`(T, S)`. + - src_key_padding_mask: :math:`(N, S)`. + - tgt_key_padding_mask: :math:`(N, T)`. + - memory_key_padding_mask: :math:`(N, S)`. + Note: [src/tgt/memory]_mask ensures that position i is allowed to attend the unmasked + positions. If a ByteTensor is provided, the non-zero positions are not allowed to attend + while the zero positions will be unchanged. If a BoolTensor is provided, positions with ``True`` + are not allowed to attend while ``False`` values will be unchanged. If a FloatTensor + is provided, it will be added to the attention weight. + [src/tgt/memory]_key_padding_mask provides specified elements in the key to be ignored by + the attention. If a ByteTensor is provided, the non-zero positions will be ignored while the zero + positions will be unchanged. If a BoolTensor is provided, the positions with the + value of ``True`` will be ignored while the position with the value of ``False`` will be unchanged. + - output: :math:`(T, N, E)`, `(N, T, E)` if batch_first. 
+ Note: Due to the multi-head attention architecture in the transformer model, + the output sequence length of a transformer is same as the input sequence + (i.e. target) length of the decode. + where S is the source sequence length, T is the target sequence length, N is the + batch size, E is the feature number + Examples: + >>> output = transformer_model(src, tgt, src_mask=src_mask, tgt_mask=tgt_mask) + """ + + if not self.batch_first and src.size(1) != tgt.size(1): + raise RuntimeError("the batch number of src and tgt must be equal") + elif self.batch_first and src.size(0) != tgt.size(0): + raise RuntimeError("the batch number of src and tgt must be equal") + + if src.size(2) != self.d_model or tgt.size(2) != self.d_model: + raise RuntimeError("the feature number of src and tgt must be equal to d_model") + + memory = self.encoder(src, mask=src_mask, src_key_padding_mask=src_key_padding_mask) + output = self.decoder(tgt, memory, tgt_mask=tgt_mask, memory_mask=memory_mask, + tgt_key_padding_mask=tgt_key_padding_mask, + memory_key_padding_mask=memory_key_padding_mask) + return output + + @staticmethod + def generate_square_subsequent_mask(sz: int) -> Tensor: + r"""Generate a square mask for the sequence. The masked positions are filled with float('-inf'). + Unmasked positions are filled with float(0.0). + """ + return torch.triu(torch.full((sz, sz), float('-inf')), diagonal=1) + + def _reset_parameters(self): + r"""Initiate parameters in the transformer model.""" + + for p in self.parameters(): + if p.dim() > 1: + xavier_uniform_(p) + + +class TransformerEncoder(Module): + r"""TransformerEncoder is a stack of N encoder layers + Args: + encoder_layer: an instance of the TransformerEncoderLayer() class (required). + num_layers: the number of sub-encoder-layers in the encoder (required). + norm: the layer normalization component (optional). 
+ Examples:: + >>> encoder_layer = nn.TransformerEncoderLayer(d_model=512, nhead=8) + >>> transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers=6) + >>> src = torch.rand(10, 32, 512) + >>> out = transformer_encoder(src) + """ + __constants__ = ['norm'] + + def __init__(self, encoder_layer, num_layers, norm=None): + super(TransformerEncoder, self).__init__() + self.layers = _get_clones(encoder_layer, num_layers) + self.num_layers = num_layers + self.norm = norm + + def forward(self, src: Tensor, mask: Optional[Tensor] = None, src_key_padding_mask: Optional[Tensor] = None) -> Tensor: + r"""Pass the input through the encoder layers in turn. + Args: + src: the sequence to the encoder (required). + mask: the mask for the src sequence (optional). + src_key_padding_mask: the mask for the src keys per batch (optional). + Shape: + see the docs in Transformer class. + """ + output = src + + for mod in self.layers: + output = mod(output, src_mask=mask, src_key_padding_mask=src_key_padding_mask) + + if self.norm is not None: + output = self.norm(output) + + return output + + +class TransformerDecoder(Module): + r"""TransformerDecoder is a stack of N decoder layers + Args: + decoder_layer: an instance of the TransformerDecoderLayer() class (required). + num_layers: the number of sub-decoder-layers in the decoder (required). + norm: the layer normalization component (optional). 
+ Examples:: + >>> decoder_layer = nn.TransformerDecoderLayer(d_model=512, nhead=8) + >>> transformer_decoder = nn.TransformerDecoder(decoder_layer, num_layers=6) + >>> memory = torch.rand(10, 32, 512) + >>> tgt = torch.rand(20, 32, 512) + >>> out = transformer_decoder(tgt, memory) + """ + __constants__ = ['norm'] + + def __init__(self, decoder_layer, num_layers, norm=None): + super(TransformerDecoder, self).__init__() + self.layers = _get_clones(decoder_layer, num_layers) + self.num_layers = num_layers + self.norm = norm + + def forward(self, tgt: Tensor, memory: Tensor, tgt_mask: Optional[Tensor] = None, + memory_mask: Optional[Tensor] = None, tgt_key_padding_mask: Optional[Tensor] = None, + memory_key_padding_mask: Optional[Tensor] = None) -> Tensor: + r"""Pass the inputs (and mask) through the decoder layer in turn. + Args: + tgt: the sequence to the decoder (required). + memory: the sequence from the last layer of the encoder (required). + tgt_mask: the mask for the tgt sequence (optional). + memory_mask: the mask for the memory sequence (optional). + tgt_key_padding_mask: the mask for the tgt keys per batch (optional). + memory_key_padding_mask: the mask for the memory keys per batch (optional). + Shape: + see the docs in Transformer class. + """ + output = tgt + + for mod in self.layers: + output = mod(output, memory, tgt_mask=tgt_mask, + memory_mask=memory_mask, + tgt_key_padding_mask=tgt_key_padding_mask, + memory_key_padding_mask=memory_key_padding_mask) + + if self.norm is not None: + output = self.norm(output) + + return output + +class TransformerEncoderLayer(Module): + r"""TransformerEncoderLayer is made up of self-attn and feedforward network. + This standard encoder layer is based on the paper "Attention Is All You Need". + Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N Gomez, + Lukasz Kaiser, and Illia Polosukhin. 2017. Attention is all you need. 
class TransformerEncoderLayer(Module):
    r"""One encoder layer: self-attention followed by a feed-forward network.

    Based on "Attention Is All You Need" (Vaswani et al., 2017).

    Args:
        d_model: number of expected features in the input (required).
        nhead: number of attention heads (required).
        dim_feedforward: hidden width of the feed-forward network (default=2048).
        dropout: dropout rate (default=0.1).
        activation: intermediate activation; "relu"/"gelu" string or a callable
            (default: ``F.relu``).
        layer_norm_eps: eps for the layer-norm components (default=1e-5).
        batch_first: if ``True``, tensors are (batch, seq, feature) (default ``False``).
        norm_first: if ``True``, layer norm runs before attention/feed-forward
            (pre-norm); otherwise after (default ``False``).
    """
    __constants__ = ['batch_first', 'norm_first']

    def __init__(self, d_model, nhead, dim_feedforward=2048, dropout=0.1, activation=F.relu,
                 layer_norm_eps=1e-5, batch_first=False, norm_first=False,
                 device=None, dtype=None) -> None:
        factory_kwargs = {'device': device, 'dtype': dtype}
        super(TransformerEncoderLayer, self).__init__()
        self.self_attn = MultiheadAttention(d_model, nhead, dropout=dropout,
                                            batch_first=batch_first, **factory_kwargs)

        # Position-wise feed-forward network.
        self.linear1 = Linear(d_model, dim_feedforward, **factory_kwargs)
        self.dropout = Dropout(dropout)
        self.linear2 = Linear(dim_feedforward, d_model, **factory_kwargs)

        self.norm_first = norm_first
        self.norm1 = LayerNorm(d_model, eps=layer_norm_eps, **factory_kwargs)
        self.norm2 = LayerNorm(d_model, eps=layer_norm_eps, **factory_kwargs)
        self.dropout1 = Dropout(dropout)
        self.dropout2 = Dropout(dropout)

        # Legacy string support for the activation function.
        if isinstance(activation, str):
            self.activation = _get_activation_fn(activation)
        else:
            self.activation = activation

    def __setstate__(self, state):
        # Old pickles may predate the 'activation' attribute.
        state.setdefault('activation', F.relu)
        super(TransformerEncoderLayer, self).__setstate__(state)

    def forward(self, src: Tensor, src_mask: Optional[Tensor] = None,
                src_key_padding_mask: Optional[Tensor] = None) -> Tensor:
        r"""Run one encoder layer over ``src``.

        Args:
            src: the sequence to the encoder layer (required).
            src_mask: the mask for the src sequence (optional).
            src_key_padding_mask: the mask for the src keys per batch (optional).
        """
        # Pre-/post-norm variants; see Fig. 1 of https://arxiv.org/pdf/2002.04745v1.pdf
        x = src
        if self.norm_first:
            x = x + self._sa_block(self.norm1(x), src_mask, src_key_padding_mask)
            x = x + self._ff_block(self.norm2(x))
            return x
        x = self.norm1(x + self._sa_block(x, src_mask, src_key_padding_mask))
        return self.norm2(x + self._ff_block(x))

    def _sa_block(self, x: Tensor,
                  attn_mask: Optional[Tensor], key_padding_mask: Optional[Tensor]) -> Tensor:
        # Self-attention sub-block (weights discarded).
        attn_out = self.self_attn(x, x, x,
                                  attn_mask=attn_mask,
                                  key_padding_mask=key_padding_mask,
                                  need_weights=False)[0]
        return self.dropout1(attn_out)

    def _ff_block(self, x: Tensor) -> Tensor:
        # Feed-forward sub-block.
        return self.dropout2(self.linear2(self.dropout(self.activation(self.linear1(x)))))
+ Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N Gomez, + Lukasz Kaiser, and Illia Polosukhin. 2017. Attention is all you need. In Advances in + Neural Information Processing Systems, pages 6000-6010. Users may modify or implement + in a different way during application. + Args: + d_model: the number of expected features in the input (required). + nhead: the number of heads in the multiheadattention models (required). + dim_feedforward: the dimension of the feedforward network model (default=2048). + dropout: the dropout value (default=0.1). + activation: the activation function of the intermediate layer, can be a string + ("relu" or "gelu") or a unary callable. Default: relu + layer_norm_eps: the eps value in layer normalization components (default=1e-5). + batch_first: If ``True``, then the input and output tensors are provided + as (batch, seq, feature). Default: ``False``. + norm_first: if ``True``, layer norm is done prior to self attention, multihead + attention and feedforward operations, respectivaly. Otherwise it's done after. + Default: ``False`` (after). 
+ Examples:: + >>> decoder_layer = nn.TransformerDecoderLayer(d_model=512, nhead=8) + >>> memory = torch.rand(10, 32, 512) + >>> tgt = torch.rand(20, 32, 512) + >>> out = decoder_layer(tgt, memory) + Alternatively, when ``batch_first`` is ``True``: + >>> decoder_layer = nn.TransformerDecoderLayer(d_model=512, nhead=8, batch_first=True) + >>> memory = torch.rand(32, 10, 512) + >>> tgt = torch.rand(32, 20, 512) + >>> out = decoder_layer(tgt, memory) + """ + __constants__ = ['batch_first', 'norm_first'] + + def __init__(self, d_model, nhead, dim_feedforward=2048, dropout=0.1, activation=F.relu, + layer_norm_eps=1e-5, batch_first=False, norm_first=False, + device=None, dtype=None) -> None: + factory_kwargs = {'device': device, 'dtype': dtype} + super(TransformerDecoderLayer, self).__init__() + self.self_attn = MultiheadAttention(d_model, nhead, dropout=dropout, batch_first=batch_first, + **factory_kwargs) + self.multihead_attn = MultiheadAttention(d_model, nhead, dropout=dropout, batch_first=batch_first, + **factory_kwargs) + # Implementation of Feedforward model + self.linear1 = Linear(d_model, dim_feedforward, **factory_kwargs) + self.dropout = Dropout(dropout) + self.linear2 = Linear(dim_feedforward, d_model, **factory_kwargs) + + self.norm_first = norm_first + self.norm1 = LayerNorm(d_model, eps=layer_norm_eps, **factory_kwargs) + self.norm2 = LayerNorm(d_model, eps=layer_norm_eps, **factory_kwargs) + self.norm3 = LayerNorm(d_model, eps=layer_norm_eps, **factory_kwargs) + self.dropout1 = Dropout(dropout) + self.dropout2 = Dropout(dropout) + self.dropout3 = Dropout(dropout) + + # Legacy string support for activation function. 
+ if isinstance(activation, str): + self.activation = _get_activation_fn(activation) + else: + self.activation = activation + + def __setstate__(self, state): + if 'activation' not in state: + state['activation'] = F.relu + super(TransformerDecoderLayer, self).__setstate__(state) + + def forward(self, tgt: Tensor, memory: Tensor, tgt_mask: Optional[Tensor] = None, memory_mask: Optional[Tensor] = None, + tgt_key_padding_mask: Optional[Tensor] = None, memory_key_padding_mask: Optional[Tensor] = None) -> Tensor: + r"""Pass the inputs (and mask) through the decoder layer. + Args: + tgt: the sequence to the decoder layer (required). + memory: the sequence from the last layer of the encoder (required). + tgt_mask: the mask for the tgt sequence (optional). + memory_mask: the mask for the memory sequence (optional). + tgt_key_padding_mask: the mask for the tgt keys per batch (optional). + memory_key_padding_mask: the mask for the memory keys per batch (optional). + Shape: + see the docs in Transformer class. + """ + # see Fig. 
1 of https://arxiv.org/pdf/2002.04745v1.pdf + + x = tgt + if self.norm_first: + x = x + self._sa_block(self.norm1(x), tgt_mask, tgt_key_padding_mask) + x = x + self._mha_block(self.norm2(x), memory, memory_mask, memory_key_padding_mask) + x = x + self._ff_block(self.norm3(x)) + else: + x = self.norm1(x + self._sa_block(x, tgt_mask, tgt_key_padding_mask)) + x = self.norm2(x + self._mha_block(x, memory, memory_mask, memory_key_padding_mask)) + x = self.norm3(x + self._ff_block(x)) + + return x + + # self-attention block + def _sa_block(self, x: Tensor, + attn_mask: Optional[Tensor], key_padding_mask: Optional[Tensor]) -> Tensor: + x = self.self_attn(x, x, x, + attn_mask=attn_mask, + key_padding_mask=key_padding_mask, + need_weights=False)[0] + return self.dropout1(x) + + # multihead attention block + def _mha_block(self, x: Tensor, mem: Tensor, + attn_mask: Optional[Tensor], key_padding_mask: Optional[Tensor]) -> Tensor: + x = self.multihead_attn(x, mem, mem, + attn_mask=attn_mask, + key_padding_mask=key_padding_mask, + need_weights=False)[0] + return self.dropout2(x) + + # feed forward block + def _ff_block(self, x: Tensor) -> Tensor: + x = self.linear2(self.dropout(self.activation(self.linear1(x)))) + return self.dropout3(x) + + +def _get_clones(module, N): + return ModuleList([copy.deepcopy(module) for i in range(N)]) + + +def _get_activation_fn(activation): + if activation == "relu": + return F.relu + elif activation == "gelu": + return F.gelu + + raise RuntimeError("activation should be relu/gelu, not {}".format(activation)) +class PositionalEncoding(nn.Module): + def __init__(self, + emb_size: int, + dropout: float, + maxlen: int = 200): + super(PositionalEncoding, self).__init__() + den = torch.exp(- torch.arange(0, emb_size, 2)* math.log(10000) / emb_size) + pos = torch.arange(0, maxlen).reshape(maxlen, 1) + pos_embedding = torch.zeros((maxlen, emb_size)) + pos_embedding[:, 0::2] = torch.sin(pos * den) + pos_embedding[:, 1::2] = torch.cos(pos * den) + 
class Model(nn.Module):
    """Encoder-decoder Transformer over token sequences.

    ``config`` is a dict keyed by the string constants in
    ``SCMG.config.varables`` (vocab size, block size, attention dims, ...).
    """

    def __init__(self, config):
        super().__init__()
        self.tok_emb = nn.Embedding(config[varables.SIZE_VOCAB], config[varables.DIM_ATTENTION])
        # Learned absolute position embeddings, shared by encoder and decoder inputs.
        self.pos_emb = nn.Parameter(
            torch.zeros(1, config[varables.SIZE_BLOCK], config[varables.DIM_ATTENTION]))
        self.drop = nn.Dropout(config[varables.RATE_DROPOUT])
        self.transformer_model = Transformer(d_model=config[varables.DIM_ATTENTION],
                                             nhead=config[varables.NUM_HEADS],
                                             dim_feedforward=config[varables.DIM_FEEDFORWARD],
                                             num_encoder_layers=config[varables.NUM_ENCODER_LAYERS],
                                             num_decoder_layers=config[varables.NUM_DECODER_LAYERS],
                                             dropout=config[varables.RATE_DROPOUT],
                                             activation='gelu',
                                             batch_first=True,
                                             # device=config[varables.DEVICE]
                                             )
        self.generator = nn.Linear(config[varables.DIM_ATTENTION], config[varables.SIZE_VOCAB])
        # BUG FIX: init_scheduler() reads self.optimizer, which was never
        # assigned anywhere — define it here and set it in init_optimizers().
        self.optimizer = None
        self.apply(self._init_weights)

    def _init_weights(self, module):
        """GPT-style init: N(0, 0.02) for Linear/Embedding weights, zeros for
        biases, ones for LayerNorm weights."""
        if isinstance(module, (nn.Linear, nn.Embedding)):
            module.weight.data.normal_(mean=0.0, std=0.02)
            if isinstance(module, nn.Linear) and module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

    def init_optimizers(self, train_config):
        """Create an Adam optimizer over all parameters and remember it on self."""
        # BUG FIX: store the optimizer so init_scheduler() can reference it.
        self.optimizer = torch.optim.Adam(self.parameters(),
                                          lr=train_config[varables.RATE_LEARNING])
        return self.optimizer

    def init_scheduler(self, train_config):
        """Create a StepLR scheduler for the optimizer made by init_optimizers()."""
        scheduler = torch.optim.lr_scheduler.StepLR(self.optimizer,
                                                    step_size=train_config[varables.SIZE_STEP],
                                                    gamma=train_config[varables.GAMMA])
        return scheduler

    def get_collate_fn(self, vocab):
        """Return a DataLoader collate fn that right-pads encoder and decoder
        token lists (independently) with the PAD token."""
        def collate(results):
            x_in = [a[0] for a in results]
            y_in = [a[1] for a in results]
            boundary = None  # unused by this model; kept for interface parity
            max_len = max([len(a) for a in x_in])
            x_in = torch.tensor([a + [vocab[varables.TOKEN_PAD]] * (max_len - len(a)) for a in x_in],
                                dtype=torch.long)
            max_len = max([len(a) for a in y_in])
            y = torch.tensor([(a + [vocab[varables.TOKEN_PAD]] * (max_len - len(a))) for a in y_in],
                             dtype=torch.long)
            return x_in, y, boundary
        return collate

    def forward(self, x_in, y_in, y_out=None, boundary=None):
        """Run source/target through the Transformer.

        Returns (logits, loss); loss is cross-entropy against ``y_out`` when
        given, else ``None``.
        """
        _, t_x = x_in.size()
        _, t_y = y_in.size()
        x_token_embeddings = self.tok_emb(x_in)
        y_token_embeddings = self.tok_emb(y_in)
        x_position_embeddings = self.pos_emb[:, :t_x, :]
        y_position_embeddings = self.pos_emb[:, :t_y, :]
        x = self.drop(x_token_embeddings + x_position_embeddings)
        y = self.drop(y_token_embeddings + y_position_embeddings)
        # Causal mask so decoder positions cannot attend to the future.
        decoder_mask = self.transformer_model.generate_square_subsequent_mask(t_y).to(y_in.device)
        out = self.transformer_model(x, y, tgt_mask=decoder_mask)
        logits = self.generator(out)
        loss = None
        if y_out is not None:
            loss = F.cross_entropy(logits.view(-1, logits.size(-1)), y_out.view(-1))
        return logits, loss


# --- sampler utilities (SCMG/models/Transformer_Torch/sampler.py) ---

def set_seed(seed):
    """Seed python, numpy and torch (all CUDA devices) for reproducibility."""
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)


def top_k_logits(logits, k):
    """Return a copy of ``logits`` with every entry below the k-th largest
    value of its row replaced by -inf (input is not mutated)."""
    v, ix = torch.topk(logits, k)
    out = logits.clone()
    out[out < v[:, [-1]]] = -float('Inf')
    return out


@torch.no_grad()
def sample(model, x, steps, temperature=1.0, sample=False, top_k=None):
    """Autoregressively extend ``x`` by ``steps`` tokens.

    Greedy by default; multinomial when ``sample`` is True; optionally
    restricted to the ``top_k`` most likely tokens. The context is cropped to
    the model's block size each step.
    """
    block_size = model.get_block_size()
    model.eval()
    for k in range(steps):
        x_cond = x if x.size(1) <= block_size else x[:, -block_size:]
        logits, _ = model(x_cond)
        logits = logits[:, -1, :] / temperature
        if top_k is not None:
            logits = top_k_logits(logits, top_k)
        probs = F.softmax(logits, dim=-1)
        if sample:
            ix = torch.multinomial(probs, num_samples=1)
        else:
            _, ix = torch.topk(probs, k=1, dim=-1)
        x = torch.cat((x, ix), dim=1)

    return x
@torch.no_grad()
def sample(model, x, steps, temperature=1.0, boundary=None):
    """Autoregressively extend ``x`` by ``steps`` multinomially sampled tokens,
    passing ``boundary`` through to the model each step.

    NOTE(review): this redefines (and shadows) the earlier ``sample`` defined
    above in this module — only this boundary-aware variant is reachable.
    """
    block_size = model.get_block_size()
    model.eval()
    for _ in range(steps):
        context = x if x.size(1) <= block_size else x[:, -block_size:]
        logits, _ = model(context, boundary=boundary)
        step_logits = logits[:, -1, :] / temperature
        probs = F.softmax(step_logits, dim=-1)
        nxt = torch.multinomial(probs, num_samples=1)
        x = torch.cat((x, nxt), dim=1)
    return x


# Stray scratch literal left over from an interactive session.
'L_5*C(=O)NCc1cccc(OC)c1.*c1nsc2ccccc12COc1cccc(CNC(=O)c2cccc(NC(=O)c3nsc4ccccc34)c2)c1'


# for i in range(1,21):x
def sample_L(i, option='string'):
    """Scratch/debug driver that samples 32 completions of a fixed scaffold
    prompt prefixed with ``L_<i>`` and prints each decoded string plus a 1/0
    validity flag.

    NOTE(review): relies on module-level globals ``vocab``, ``inv``, ``model``
    and ``test_valid`` that are not defined in this file — calling it as-is
    raises NameError. Presumably meant to run in a notebook session.
    """
    # i=2
    prefix = 'L_' + str(i)
    string_input = prefix + '*O=C1NN=Cc2c1cccc2.*O=C(C1CC1)N1CCNCC1'
    array_input = [vocab[a] for a in [''] + list(string_input)]
    boundary = [len(array_input)]
    tensor_input = torch.tensor(array_input, device='cuda').unsqueeze(0).repeat(32, 1)
    boundary = boundary * 32
    tensor_output = sample(model, tensor_input, 250, boundary=boundary)
    strings_output = []
    for j in range(tensor_output.shape[0]):
        # Decode everything generated after the prompt boundary, skipping pads.
        tokens = [inv[a] for a in tensor_output[j, boundary[j]:].cpu().numpy()
                  if a != vocab['']]
        # if list_string_output[0] == '':
        #     list_string_output = list_string_output[1:]
        if tokens[-1] == '':
            tokens = tokens[:-1]
        decoded = ''.join(tokens)
        strings_output.append(decoded)
        print(decoded)
    for j in range(tensor_output.shape[0]):
        if test_valid(strings_output[j]):
            print(1)
        else:
            print(0)

    # logits,_ = model(tensor_input,boundary=boundary)


# Stray scratch literal: the tokenized form of the prompt above.
['', 'L', '_', '5', '*', 'C', '(', '=', 'O', ')', 'N', 'C', 'c', '1', 'c', 'c', 'c', 'c',
 '(', 'O', 'C', ')', 'c', '1', '.', '*', 'c', '1', 'n', 's', 'c', '2', 'c', 'c', 'c', 'c',
 'c', '1', '2', 'C', 'O', 'c', '1', 'c', 'c', 'c', 'c', '(', 'C', 'N', 'C', '(', '=', 'O',
 ')', 'c', '2', 'c', 'c', 'c', 'c', '(', 'N', 'C', '(', '=', 'O', ')', 'c', '3', 'n', 's',
 'c', '4', 'c', 'c', 'c', 'c', 'c', '3', '4', ')', 'c', '2', ')', 'c', '1', '']


# ===== SCMG/models/Transformer_debug copy/model copy 2.py =====
import math
import logging

import torch
import torch.nn as nn
from torch.nn import functional as F

logger = logging.getLogger(__name__)

# class ModelConfig():
#     rate_dropout_embedding = 0.1
#     rate_dropout_residue = 0.1
#     rate_dropout_attention = 0.1
#     block_size = 125
#     def __init__(self, size_vocab, **kwargs):
#         self.size_vocab = size_vocab
#         for k, v in kwargs.items():
#             setattr(self, k, v)
from SCMG.config import varables  # string key constants for the config dict


class CausalSelfAttention(nn.Module):
    """Multi-head self-attention with a lower-triangular (causal) mask so each
    position only attends to itself and earlier positions.

    ``config`` is a dict keyed by ``SCMG.config.varables`` constants.
    """

    def __init__(self, config):
        super().__init__()
        assert config[varables.DIM_ATTENTION] % config[varables.NUM_HEADS] == 0
        self.key = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION])
        self.query = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION])
        self.value = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION])
        self.dropout_attention = nn.Dropout(config[varables.RATE_DROPOUT])
        self.dropout_residue = nn.Dropout(config[varables.RATE_DROPOUT])
        self.projection = nn.Linear(config[varables.DIM_ATTENTION], config[varables.DIM_EMBEDDING])
        # Causal mask buffer, shaped (1, 1, block, block) for broadcasting.
        self.register_buffer(
            "mask",
            torch.tril(torch.ones(config[varables.SIZE_BLOCK], config[varables.SIZE_BLOCK]))
            .view(1, 1, config[varables.SIZE_BLOCK], config[varables.SIZE_BLOCK]))
        self.n_head = config[varables.NUM_HEADS]
        self.single_head_dim = config[varables.DIM_ATTENTION] // self.n_head
        self.attention_features = config[varables.DIM_ATTENTION]

    def forward(self, x, layer_past=None):
        batch, seq, _ = x.size()
        # Project then reshape to (batch, head, seq, head_dim).
        k = self.key(x).view(batch, seq, self.n_head, self.single_head_dim).transpose(1, 2)
        q = self.query(x).view(batch, seq, self.n_head, self.single_head_dim).transpose(1, 2)
        v = self.value(x).view(batch, seq, self.n_head, self.single_head_dim).transpose(1, 2)
        # Scaled dot-product scores with the causal mask applied pre-softmax.
        scores = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1)))
        scores = scores.masked_fill(self.mask[:, :, :seq, :seq] == 0, float('-inf'))
        weights = self.dropout_attention(F.softmax(scores, dim=-1))
        out = (weights @ v).transpose(1, 2).contiguous().view(batch, seq, self.attention_features)
        return self.dropout_residue(self.projection(out))


class CrossAttention(nn.Module):
    """Multi-head attention from decoder queries onto encoder keys/values.

    NOTE(review): the same lower-triangular mask as self-attention is applied
    across (decoder-pos, encoder-pos) — presumably intentional for the
    boundary/prefix decoding scheme used elsewhere; confirm.
    """

    def __init__(self, config):
        super().__init__()
        assert config[varables.DIM_ATTENTION] % config[varables.NUM_HEADS] == 0
        self.key = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION])
        self.query = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION])
        self.value = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION])
        self.dropout_attention = nn.Dropout(config[varables.RATE_DROPOUT])
        self.dropout_residue = nn.Dropout(config[varables.RATE_DROPOUT])
        self.projection = nn.Linear(config[varables.DIM_ATTENTION], config[varables.DIM_EMBEDDING])
        self.n_head = config[varables.NUM_HEADS]
        self.single_head_dim = config[varables.DIM_ATTENTION] // self.n_head
        self.attention_features = config[varables.DIM_ATTENTION]
        self.register_buffer(
            "mask",
            torch.tril(torch.ones(config[varables.SIZE_BLOCK], config[varables.SIZE_BLOCK]))
            .view(1, 1, config[varables.SIZE_BLOCK], config[varables.SIZE_BLOCK]))

    def forward(self, x_encoder, x_decoder, layer_past=None):
        batch_enc, seq_enc, _ = x_encoder.size()
        batch_dec, seq_dec, _ = x_decoder.size()
        # NOTE(review): views below use the encoder batch size throughout,
        # assuming encoder and decoder batches are the same size.
        k = self.key(x_encoder).view(batch_enc, seq_enc, self.n_head, self.single_head_dim).transpose(1, 2)
        q = self.query(x_decoder).view(batch_enc, seq_dec, self.n_head, self.single_head_dim).transpose(1, 2)
        v = self.value(x_encoder).view(batch_enc, seq_enc, self.n_head, self.single_head_dim).transpose(1, 2)
        scores = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1)))
        scores = scores.masked_fill(self.mask[:, :, :seq_dec, :seq_enc] == 0, float('-inf'))
        weights = self.dropout_attention(F.softmax(scores, dim=-1))
        out = (weights @ v).transpose(1, 2).contiguous().view(batch_enc, seq_dec, self.attention_features)
        return self.dropout_residue(self.projection(out))


class EncoderBlock(nn.Module):
    """Pre-norm encoder block: causal self-attention then an MLP, each with a
    residual connection."""

    def __init__(self, config):
        super().__init__()
        self.ln1 = nn.LayerNorm(config[varables.DIM_EMBEDDING])
        self.ln2 = nn.LayerNorm(config[varables.DIM_EMBEDDING])
        self.attn = CausalSelfAttention(config)
        self.mlp = nn.Sequential(
            nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_FEEDFORWARD]),
            nn.GELU(),
            nn.Linear(config[varables.DIM_FEEDFORWARD], config[varables.DIM_EMBEDDING]),
            nn.Dropout(config[varables.RATE_DROPOUT]),
        )

    def forward(self, x):
        x = x + self.attn(self.ln1(x))
        x = x + self.mlp(self.ln2(x))
        return x
class DecoderBlock(nn.Module):
    """Pre-norm decoder block: masked self-attention, cross-attention over the
    encoder output, then an MLP — each with a residual connection.

    ``config`` is a dict keyed by ``SCMG.config.varables`` constants.
    """

    def __init__(self, config):
        super().__init__()
        self.ln1 = nn.LayerNorm(config[varables.DIM_EMBEDDING])
        self.ln2 = nn.LayerNorm(config[varables.DIM_EMBEDDING])
        self.masked_attn = CausalSelfAttention(config)
        self.cross_attn = CrossAttention(config)
        self.mlp = nn.Sequential(
            nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_FEEDFORWARD]),
            nn.GELU(),
            nn.Linear(config[varables.DIM_FEEDFORWARD], config[varables.DIM_EMBEDDING]),
            nn.Dropout(config[varables.RATE_DROPOUT]),
        )

    def forward(self, x_encoder, x):
        x = x + self.masked_attn(self.ln1(x))
        # NOTE(review): ln1 is reused for the cross-attention input instead of
        # a dedicated third LayerNorm — confirm this is intentional.
        x = x + self.cross_attn(x_encoder, self.ln1(x))
        x = x + self.mlp(self.ln2(x))
        return x


import torch
import torch.nn as nn
import torch.nn.functional as F
import math


class Norm(nn.Module):
    """Layer normalization with learnable scale (alpha) and shift (bias).

    Uses the sample standard deviation over the last dimension.
    """

    def __init__(self, d_model, eps=1e-6):
        super().__init__()
        self.size = d_model
        # Learnable calibration parameters.
        self.alpha = nn.Parameter(torch.ones(self.size))
        self.bias = nn.Parameter(torch.zeros(self.size))
        self.eps = eps

    def forward(self, x):
        centered = x - x.mean(dim=-1, keepdim=True)
        return self.alpha * centered / (x.std(dim=-1, keepdim=True) + self.eps) + self.bias


def attention(q, k, v, d_k, mask=None, dropout=None):
    """Scaled dot-product attention; optional mask (0 = blocked) and dropout
    are applied to the attention weights."""
    scores = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(d_k)
    if mask is not None:
        mask = mask.unsqueeze(1)  # broadcast over the head dimension
        scores = scores.masked_fill(mask == 0, -1e9)
    scores = F.softmax(scores, dim=-1)
    if dropout is not None:
        scores = dropout(scores)
    return torch.matmul(scores, v)


class MultiHeadAttention(nn.Module):
    """Standard multi-head attention: per-head linear projections, scaled
    dot-product attention, concatenation, output projection."""

    def __init__(self, heads, d_model, dropout=0.1):
        super().__init__()
        self.d_model = d_model
        self.d_k = d_model // heads
        self.h = heads
        self.q_linear = nn.Linear(d_model, d_model)
        self.v_linear = nn.Linear(d_model, d_model)
        self.k_linear = nn.Linear(d_model, d_model)
        self.dropout = nn.Dropout(dropout)
        self.out = nn.Linear(d_model, d_model)

    def forward(self, q, k, v, mask=None):
        bs = q.size(0)
        # Project, split into heads, and move heads before sequence:
        # (bs, seq, d_model) -> (bs, h, seq, d_k).
        k = self.k_linear(k).view(bs, -1, self.h, self.d_k).transpose(1, 2)
        q = self.q_linear(q).view(bs, -1, self.h, self.d_k).transpose(1, 2)
        v = self.v_linear(v).view(bs, -1, self.h, self.d_k).transpose(1, 2)
        scores = attention(q, k, v, self.d_k, mask, self.dropout)
        # Re-join heads and apply the final projection.
        concat = scores.transpose(1, 2).contiguous().view(bs, -1, self.d_model)
        return self.out(concat)


class FeedForward(nn.Module):
    """Two-layer position-wise feed-forward network with ReLU and dropout."""

    def __init__(self, d_model, d_ff=2048, dropout=0.1):
        super().__init__()
        self.linear_1 = nn.Linear(d_model, d_ff)
        self.dropout = nn.Dropout(dropout)
        self.linear_2 = nn.Linear(d_ff, d_model)

    def forward(self, x):
        return self.linear_2(self.dropout(F.relu(self.linear_1(x))))


import torch
import torch.nn as nn
import copy


class EncoderLayer(nn.Module):
    """Pre-norm encoder layer: self-attention then feed-forward, each with a
    residual connection and dropout."""

    def __init__(self, d_model, heads, dropout=0.1):
        super().__init__()
        self.norm_1 = Norm(d_model)
        self.norm_2 = Norm(d_model)
        self.attn = MultiHeadAttention(heads, d_model, dropout=dropout)
        self.ff = FeedForward(d_model, dropout=dropout)
        self.dropout_1 = nn.Dropout(dropout)
        self.dropout_2 = nn.Dropout(dropout)

    def forward(self, x, mask):
        normed = self.norm_1(x)
        x = x + self.dropout_1(self.attn(normed, normed, normed, mask))
        normed = self.norm_2(x)
        x = x + self.dropout_2(self.ff(normed))
        return x
# build a decoder layer with two multi-head attention layers and
# one feed-forward layer
class DecoderLayer(nn.Module):
    """Pre-norm decoder layer: masked self-attention, cross-attention over the
    encoder outputs, then feed-forward — each with a residual connection."""

    def __init__(self, d_model, heads, dropout=0.1):
        super().__init__()
        self.norm_1 = Norm(d_model)
        self.norm_2 = Norm(d_model)
        self.norm_3 = Norm(d_model)

        self.dropout_1 = nn.Dropout(dropout)
        self.dropout_2 = nn.Dropout(dropout)
        self.dropout_3 = nn.Dropout(dropout)

        self.attn_1 = MultiHeadAttention(heads, d_model, dropout=dropout)
        self.attn_2 = MultiHeadAttention(heads, d_model, dropout=dropout)
        self.ff = FeedForward(d_model, dropout=dropout)

    def forward(self, x, e_outputs, src_mask, trg_mask):
        x2 = self.norm_1(x)
        # Masked self-attention over the target.
        x = x + self.dropout_1(self.attn_1(x2, x2, x2, trg_mask))
        x2 = self.norm_2(x)
        # Cross-attention: target queries over encoder outputs.
        x = x + self.dropout_2(self.attn_2(x2, e_outputs, e_outputs, src_mask))
        x2 = self.norm_3(x)
        x = x + self.dropout_3(self.ff(x2))
        return x


import torch
import torch.nn as nn
import math
from torch.autograd import Variable  # kept for compatibility; no longer used below


class Embedder(nn.Module):
    """Thin wrapper around nn.Embedding (token id -> d_model vector)."""

    def __init__(self, vocab_size, d_model):
        super().__init__()
        self.d_model = d_model
        self.embed = nn.Embedding(vocab_size, d_model)

    def forward(self, x):
        return self.embed(x)


class PositionalEncoder(nn.Module):
    """Adds a fixed sinusoidal positional signal to (batch, seq, d_model)
    embeddings, after scaling them by sqrt(d_model).

    NOTE(review): the table fills pairs (i, i+1); assumes d_model is even.
    """

    def __init__(self, d_model, max_seq_len=200, dropout=0.1):
        super().__init__()
        self.d_model = d_model
        self.dropout = nn.Dropout(dropout)
        # Constant 'pe' matrix with values dependent on pos and i.
        pe = torch.zeros(max_seq_len, d_model)
        for pos in range(max_seq_len):
            for i in range(0, d_model, 2):
                pe[pos, i] = math.sin(pos / (10000 ** ((2 * i) / d_model)))
                pe[pos, i + 1] = math.cos(pos / (10000 ** ((2 * (i + 1)) / d_model)))
        pe = pe.unsqueeze(0)
        self.register_buffer('pe', pe)

    def forward(self, x):
        # Make embeddings relatively larger than the positional signal.
        x = x * math.sqrt(self.d_model)
        seq_len = x.size(1)
        # BUG FIX: the old code called `pe.cuda()` and discarded the result,
        # so the slice never actually moved for CUDA inputs; move it to x's
        # device instead. The deprecated Variable wrapper is also dropped —
        # the registered buffer already has requires_grad=False.
        pe = self.pe[:, :seq_len].to(x.device)
        x = x + pe
        return self.dropout(x)


def get_clones(module, N):
    """Return a ModuleList of N independent deep copies of ``module``."""
    return nn.ModuleList([copy.deepcopy(module) for i in range(N)])


class Encoder(nn.Module):
    """Embedding + positional encoding + N encoder layers + final norm."""

    def __init__(self, vocab_size, d_model, N, heads, dropout):
        super().__init__()
        self.N = N
        self.embed = Embedder(vocab_size, d_model)
        self.pe = PositionalEncoder(d_model, dropout=dropout)
        self.layers = get_clones(EncoderLayer(d_model, heads, dropout), N)
        self.norm = Norm(d_model)

    def forward(self, src, mask):
        x = self.pe(self.embed(src))
        for i in range(self.N):
            x = self.layers[i](x, mask)
        return self.norm(x)


class Decoder(nn.Module):
    """Embedding + positional encoding + N decoder layers + final norm."""

    def __init__(self, vocab_size, d_model, N, heads, dropout):
        super().__init__()
        self.N = N
        self.embed = Embedder(vocab_size, d_model)
        self.pe = PositionalEncoder(d_model, dropout=dropout)
        self.layers = get_clones(DecoderLayer(d_model, heads, dropout), N)
        self.norm = Norm(d_model)

    def forward(self, trg, e_outputs, src_mask, trg_mask):
        x = self.pe(self.embed(trg))
        for i in range(self.N):
            x = self.layers[i](x, e_outputs, src_mask, trg_mask)
        return self.norm(x)
class Model(nn.Module):
    """Seq2seq Transformer built from the hand-rolled Encoder/Decoder above.

    ``config`` is a dict keyed by ``SCMG.config.varables`` constants plus
    "vocab_encoder"/"vocab_decoder" token->id mappings.
    """

    def __init__(self, config):
        super().__init__()
        self.encoder = Encoder(len(config["vocab_encoder"]), config[varables.DIM_ATTENTION],
                               config[varables.NUM_LAYERS], config[varables.NUM_HEADS],
                               config[varables.RATE_DROPOUT])
        self.decoder = Decoder(len(config["vocab_decoder"]), config[varables.DIM_ATTENTION],
                               config[varables.NUM_LAYERS], config[varables.NUM_HEADS],
                               config[varables.RATE_DROPOUT])
        self.out = nn.Linear(config[varables.DIM_ATTENTION], len(config["vocab_decoder"]))
        # BUG FIX: init_scheduler() reads self.optimizer; it is now assigned
        # by init_optimizers() instead of staying None forever.
        self.optimizer = None

    def get_block_size(self):
        # NOTE(review): self.block_size is never assigned in this class (the
        # assignment was commented out of __init__), so calling this raises
        # AttributeError — restore `self.block_size = config[varables.SIZE_BLOCK]`
        # if samplers need it.
        return self.block_size

    def _init_weights(self, module):
        """GPT-style init: N(0, 0.02) for Linear/Embedding weights, zeros for
        biases, ones for LayerNorm weights. (Currently unused: the
        `self.apply(self._init_weights)` call was commented out.)"""
        if isinstance(module, (nn.Linear, nn.Embedding)):
            module.weight.data.normal_(mean=0.0, std=0.02)
            if isinstance(module, nn.Linear) and module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

    def init_optimizers(self, train_config):
        """Create an Adam optimizer over all parameters and remember it on self."""
        # BUG FIX: store the optimizer so init_scheduler() can reference it.
        self.optimizer = torch.optim.Adam(self.parameters(),
                                          lr=train_config[varables.RATE_LEARNING])
        return self.optimizer

    def init_scheduler(self, train_config):
        """Create a StepLR scheduler for the optimizer made by init_optimizers()."""
        scheduler = torch.optim.lr_scheduler.StepLR(self.optimizer,
                                                    step_size=train_config[varables.SIZE_STEP],
                                                    gamma=train_config[varables.GAMMA])
        return scheduler

    def get_collate_fn(self, vocab_encoder, vocab_decoder):
        """Return a DataLoader collate fn that right-pads encoder and decoder
        token lists (independently) with each vocab's PAD token."""
        def collate(results):
            x_in = [a[0] for a in results]
            y_in = [a[1] for a in results]
            boundary = -1  # unused by this model; kept for interface parity
            max_len_x = max([len(a) for a in x_in])
            max_len_y = max([len(a) for a in y_in])
            x = torch.tensor([(a + [vocab_encoder[varables.TOKEN_PAD]] * (max_len_x - len(a))) for a in x_in],
                             dtype=torch.long)
            y = torch.tensor([(a + [vocab_decoder[varables.TOKEN_PAD]] * (max_len_y - len(a))) for a in y_in],
                             dtype=torch.long)
            return x, y, boundary
        return collate

    def forward(self, src, trg, trg_out, boundary=None):
        """Encode ``src``, decode ``trg`` with a causal mask, project to vocab
        logits. Returns (logits, loss); loss is cross-entropy against
        ``trg_out`` when given, else None."""
        src_mask = None
        # Causal mask over target positions, shaped (1, 1, T, T).
        # NOTE(review): attention() additionally unsqueezes the mask — confirm
        # the resulting broadcast is what was intended.
        trg_mask = torch.tril(torch.ones(trg.shape[1], trg.shape[1])).view(
            1, 1, trg.shape[1], trg.shape[1]).to(trg.device)
        e_outputs = self.encoder(src, src_mask)
        d_output = self.decoder(trg, e_outputs, src_mask, trg_mask)
        logits = self.out(d_output)
        loss = None
        if trg_out is not None:
            loss = F.cross_entropy(logits.view(-1, logits.size(-1)), trg_out.view(-1))
        return logits, loss

# mark test
b/SCMG/models/Transformer_debug/__pycache__/model.cpython-310.pyc differ diff --git a/SCMG/models/Transformer_debug/__pycache__/sampler.cpython-310.pyc b/SCMG/models/Transformer_debug/__pycache__/sampler.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..81632f9311fa817ec3b38eeebebe3d15e43abab0 Binary files /dev/null and b/SCMG/models/Transformer_debug/__pycache__/sampler.cpython-310.pyc differ diff --git a/SCMG/models/Transformer_debug/model copy 2.py b/SCMG/models/Transformer_debug/model copy 2.py new file mode 100644 index 0000000000000000000000000000000000000000..8b254fbe02eafd3f7370cfae17eb6729563aa260 --- /dev/null +++ b/SCMG/models/Transformer_debug/model copy 2.py @@ -0,0 +1,420 @@ +import math +import logging + +import torch +import torch.nn as nn +from torch.nn import functional as F + +logger = logging.getLogger(__name__) +from SCMG.config import varables + +# class ModelConfig(): +# rate_dropout_embedding = 0.1 +# rate_dropout_residue = 0.1 +# rate_dropout_attention = 0.1 +# block_size=125 +# def __init__(self, size_vocab, **kwargs): +# self.size_vocab = size_vocab +# for k,v in kwargs.items(): +# setattr(self, k, v) + +class CausalSelfAttention(nn.Module): + def __init__(self, config): + super().__init__() + assert config[varables.DIM_ATTENTION] % config[varables.NUM_HEADS] == 0 + self.key = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION]) + self.query = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION]) + self.value = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION]) + self.dropout_attention = nn.Dropout(config[varables.RATE_DROPOUT]) + self.dropout_residue = nn.Dropout(config[varables.RATE_DROPOUT]) + self.projection = nn.Linear(config[varables.DIM_ATTENTION], config[varables.DIM_EMBEDDING]) + self.register_buffer("mask", torch.tril(torch.ones(config[varables.SIZE_BLOCK], config[varables.SIZE_BLOCK])) + .view(1, 1, config[varables.SIZE_BLOCK], 
config[varables.SIZE_BLOCK])) + self.n_head = config[varables.NUM_HEADS] + self.single_head_dim = config[varables.DIM_ATTENTION] // self.n_head + self.attention_features = config[varables.DIM_ATTENTION] + + def forward(self, x, layer_past=None): + B, T, C = x.size() + k = self.key(x).view(B, T, self.n_head,self.single_head_dim).transpose(1, 2) + q = self.query(x).view(B, T, self.n_head,self.single_head_dim).transpose(1, 2) + v = self.value(x).view(B, T, self.n_head,self.single_head_dim).transpose(1, 2) + att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1))) + att = att.masked_fill(self.mask[:,:,:T,:T] == 0, float('-inf')) + att = F.softmax(att, dim=-1) + att = self.dropout_attention(att) + y = att @ v + y = y.transpose(1, 2).contiguous().view(B, T, self.attention_features) + y = self.dropout_residue(self.projection(y)) + return y + + +class CrossAttention(nn.Module): + def __init__(self, config): + super().__init__() + assert config[varables.DIM_ATTENTION] % config[varables.NUM_HEADS] == 0 + self.key = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION]) + self.query = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION]) + self.value = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION]) + self.dropout_attention = nn.Dropout(config[varables.RATE_DROPOUT]) + self.dropout_residue = nn.Dropout(config[varables.RATE_DROPOUT]) + self.projection = nn.Linear(config[varables.DIM_ATTENTION], config[varables.DIM_EMBEDDING]) + self.n_head = config[varables.NUM_HEADS] + self.single_head_dim = config[varables.DIM_ATTENTION] // self.n_head + self.attention_features = config[varables.DIM_ATTENTION] + self.register_buffer("mask", torch.tril(torch.ones(config[varables.SIZE_BLOCK], config[varables.SIZE_BLOCK])) + .view(1, 1, config[varables.SIZE_BLOCK], config[varables.SIZE_BLOCK])) + + def forward(self, x_encoder,x_decoder, layer_past=None): + B_encoder, T_encoder, C_encoder = x_encoder.size() + B_decoder, 
T_decoder, C_decoder = x_decoder.size() + k = self.key( x_encoder).view(B_encoder, T_encoder, self.n_head,self.single_head_dim).transpose(1, 2) + q = self.query(x_decoder).view(B_encoder, T_decoder, self.n_head,self.single_head_dim).transpose(1, 2) + v = self.value(x_encoder).view(B_encoder, T_encoder, self.n_head,self.single_head_dim).transpose(1, 2) + att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1))) + att = att.masked_fill(self.mask[:,:,:T_decoder,:T_encoder] == 0, float('-inf')) + att = F.softmax(att, dim=-1) + att = self.dropout_attention(att) + y = att @ v + y = y.transpose(1, 2).contiguous().view(B_encoder, T_decoder, self.attention_features) + y = self.dropout_residue(self.projection(y)) + return y + + + + +class EncoderBlock(nn.Module): + def __init__(self, config): + super().__init__() + self.ln1 = nn.LayerNorm(config[varables.DIM_EMBEDDING]) + self.ln2 = nn.LayerNorm(config[varables.DIM_EMBEDDING]) + self.attn = CausalSelfAttention(config) + self.mlp = nn.Sequential( + nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_FEEDFORWARD]), + nn.GELU(), + nn.Linear(config[varables.DIM_FEEDFORWARD], config[varables.DIM_EMBEDDING]), + nn.Dropout(config[varables.RATE_DROPOUT]), + ) + + def forward(self, x): + # = y_input + x = x + self.attn(self.ln1(x)) + x = x + self.mlp(self.ln2(x)) + return x + +class DecoderBlock(nn.Module): + def __init__(self, config): + super().__init__() + self.ln1 = nn.LayerNorm(config[varables.DIM_EMBEDDING]) + self.ln2 = nn.LayerNorm(config[varables.DIM_EMBEDDING]) + self.masked_attn = CausalSelfAttention(config) + self.cross_attn = CrossAttention(config) + self.mlp = nn.Sequential( + nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_FEEDFORWARD]), + nn.GELU(), + nn.Linear(config[varables.DIM_FEEDFORWARD], config[varables.DIM_EMBEDDING]), + nn.Dropout(config[varables.RATE_DROPOUT]), + ) + + def forward(self, x_encoder,x): + # = y_input + x = x + self.masked_attn(self.ln1(x)) + x = x + 
self.cross_attn(x_encoder,self.ln1(x)) + x = x + self.mlp(self.ln2(x)) + return x + + + + + + + + + + + + + + + + + +import torch +import torch.nn as nn +import torch.nn.functional as F +import math + + +class Norm(nn.Module): + def __init__(self, d_model, eps = 1e-6): + super().__init__() + + self.size = d_model + + # create two learnable parameters to calibrate normalisation + self.alpha = nn.Parameter(torch.ones(self.size)) + self.bias = nn.Parameter(torch.zeros(self.size)) + + self.eps = eps + + def forward(self, x): + norm = self.alpha * (x - x.mean(dim=-1, keepdim=True)) \ + / (x.std(dim=-1, keepdim=True) + self.eps) + self.bias + return norm + +def attention(q, k, v, d_k, mask=None, dropout=None): + + scores = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(d_k) + + if mask is not None: + mask = mask.unsqueeze(1) + scores = scores.masked_fill(mask == 0, -1e9) + + scores = F.softmax(scores, dim=-1) + + if dropout is not None: + scores = dropout(scores) + + output = torch.matmul(scores, v) + return output + +class MultiHeadAttention(nn.Module): + def __init__(self, heads, d_model, dropout = 0.1): + super().__init__() + + self.d_model = d_model + self.d_k = d_model // heads + self.h = heads + + self.q_linear = nn.Linear(d_model, d_model) + self.v_linear = nn.Linear(d_model, d_model) + self.k_linear = nn.Linear(d_model, d_model) + + self.dropout = nn.Dropout(dropout) + self.out = nn.Linear(d_model, d_model) + + def forward(self, q, k, v, mask=None): + + bs = q.size(0) + + # perform linear operation and split into N heads + k = self.k_linear(k).view(bs, -1, self.h, self.d_k) + q = self.q_linear(q).view(bs, -1, self.h, self.d_k) + v = self.v_linear(v).view(bs, -1, self.h, self.d_k) + + # transpose to get dimensions bs * N * sl * d_model + k = k.transpose(1,2) + q = q.transpose(1,2) + v = v.transpose(1,2) + + + # calculate attention using function we will define next + scores = attention(q, k, v, self.d_k, mask, self.dropout) + # concatenate heads and put through 
final linear layer + concat = scores.transpose(1,2).contiguous()\ + .view(bs, -1, self.d_model) + output = self.out(concat) + + return output + +class FeedForward(nn.Module): + def __init__(self, d_model, d_ff=2048, dropout = 0.1): + super().__init__() + + # We set d_ff as a default to 2048 + self.linear_1 = nn.Linear(d_model, d_ff) + self.dropout = nn.Dropout(dropout) + self.linear_2 = nn.Linear(d_ff, d_model) + + def forward(self, x): + x = self.dropout(F.relu(self.linear_1(x))) + x = self.linear_2(x) + return x + + + + +import torch +import torch.nn as nn +import copy + + +class EncoderLayer(nn.Module): + def __init__(self, d_model, heads, dropout=0.1): + super().__init__() + self.norm_1 = Norm(d_model) + self.norm_2 = Norm(d_model) + self.attn = MultiHeadAttention(heads, d_model, dropout=dropout) + self.ff = FeedForward(d_model, dropout=dropout) + self.dropout_1 = nn.Dropout(dropout) + self.dropout_2 = nn.Dropout(dropout) + + def forward(self, x, mask): + x2 = self.norm_1(x) + x = x + self.dropout_1(self.attn(x2,x2,x2,mask)) + x2 = self.norm_2(x) + x = x + self.dropout_2(self.ff(x2)) + return x + +# build a decoder layer with two multi-head attention layers and +# one feed-forward layer +class DecoderLayer(nn.Module): + def __init__(self, d_model, heads, dropout=0.1): + super().__init__() + self.norm_1 = Norm(d_model) + self.norm_2 = Norm(d_model) + self.norm_3 = Norm(d_model) + + self.dropout_1 = nn.Dropout(dropout) + self.dropout_2 = nn.Dropout(dropout) + self.dropout_3 = nn.Dropout(dropout) + + self.attn_1 = MultiHeadAttention(heads, d_model, dropout=dropout) + self.attn_2 = MultiHeadAttention(heads, d_model, dropout=dropout) + self.ff = FeedForward(d_model, dropout=dropout) + + def forward(self, x, e_outputs, src_mask, trg_mask): + x2 = self.norm_1(x) + x = x + self.dropout_1(self.attn_1(x2, x2, x2, trg_mask)) + x2 = self.norm_2(x) + x = x + self.dropout_2(self.attn_2(x2, e_outputs, e_outputs, \ + src_mask)) + x2 = self.norm_3(x) + x = x + 
self.dropout_3(self.ff(x2)) + return x + + +import torch +import torch.nn as nn +import math +from torch.autograd import Variable + +class Embedder(nn.Module): + def __init__(self, vocab_size, d_model): + super().__init__() + self.d_model = d_model + self.embed = nn.Embedding(vocab_size, d_model) + def forward(self, x): + return self.embed(x) + +class PositionalEncoder(nn.Module): + def __init__(self, d_model, max_seq_len = 200, dropout = 0.1): + super().__init__() + self.d_model = d_model + self.dropout = nn.Dropout(dropout) + # create constant 'pe' matrix with values dependant on + # pos and i + pe = torch.zeros(max_seq_len, d_model) + for pos in range(max_seq_len): + for i in range(0, d_model, 2): + pe[pos, i] = \ + math.sin(pos / (10000 ** ((2 * i)/d_model))) + pe[pos, i + 1] = \ + math.cos(pos / (10000 ** ((2 * (i + 1))/d_model))) + pe = pe.unsqueeze(0) + self.register_buffer('pe', pe) + + + def forward(self, x): + # make embeddings relatively larger + x = x * math.sqrt(self.d_model) + #add constant to embedding + seq_len = x.size(1) + pe = Variable(self.pe[:,:seq_len], requires_grad=False) + if x.is_cuda: + pe.cuda() + x = x + pe + return self.dropout(x) + +def get_clones(module, N): + return nn.ModuleList([copy.deepcopy(module) for i in range(N)]) + +class Encoder(nn.Module): + def __init__(self, vocab_size, d_model, N, heads, dropout): + super().__init__() + self.N = N + self.embed = Embedder(vocab_size, d_model) + self.pe = PositionalEncoder(d_model, dropout=dropout) + self.layers = get_clones(EncoderLayer(d_model, heads, dropout), N) + self.norm = Norm(d_model) + def forward(self, src, mask): + x = self.embed(src) + x = self.pe(x) + for i in range(self.N): + x = self.layers[i](x, mask) + return self.norm(x) + +class Decoder(nn.Module): + def __init__(self, vocab_size, d_model, N, heads, dropout): + super().__init__() + self.N = N + self.embed = Embedder(vocab_size, d_model) + self.pe = PositionalEncoder(d_model, dropout=dropout) + self.layers = 
get_clones(DecoderLayer(d_model, heads, dropout), N) + self.norm = Norm(d_model) + def forward(self, trg, e_outputs, src_mask, trg_mask): + x = self.embed(trg) + x = self.pe(x) + for i in range(self.N): + x = self.layers[i](x, e_outputs, src_mask, trg_mask) + return self.norm(x) + +class Model(nn.Module): + def __init__(self, config): + super().__init__() + self.encoder = Encoder(len(config["vocab_encoder"]), config[varables.DIM_ATTENTION], config[varables.NUM_LAYERS], config[varables.NUM_HEADS], config[varables.RATE_DROPOUT]) + self.decoder = Decoder(len(config["vocab_decoder"]), config[varables.DIM_ATTENTION], config[varables.NUM_LAYERS], config[varables.NUM_HEADS], config[varables.RATE_DROPOUT]) + self.out = nn.Linear(config[varables.DIM_ATTENTION], len(config["vocab_decoder"])) + # self.tok_emb = nn.Embedding(config[varables.SIZE_VOCAB], config[varables.DIM_EMBEDDING]) + # self.pos_emb = nn.Parameter(torch.zeros(1, config[varables.SIZE_BLOCK], config[varables.DIM_EMBEDDING])) + # self.drop = nn.Dropout(config[varables.RATE_DROPOUT]) + # self.encoder_blocks = nn.ModuleList([EncoderBlock(config) for _ in range(config[varables.NUM_LAYERS])]) + # self.decoder_blocks = nn.ModuleList([DecoderBlock(config) for _ in range(config[varables.NUM_LAYERS])]) + # self.blocks = nn.Sequential(*[DecoderBlock(config) for _ in range(config[varables.NUM_LAYERS])]) + # self.ln_f = nn.LayerNorm(config[varables.DIM_EMBEDDING]) + # self.head = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.SIZE_VOCAB], bias=False) + # self.block_size = config[varables.SIZE_BLOCK] + # self.apply(self._init_weights) + # logger.info("number of parameters: %e", sum(p.numel() for p in self.parameters())) + self.optimizer = None + + def get_block_size(self): + return self.block_size + + def _init_weights(self, module): + if isinstance(module, (nn.Linear, nn.Embedding)): + module.weight.data.normal_(mean=0.0, std=0.02) + if isinstance(module, nn.Linear) and module.bias is not None: + 
module.bias.data.zero_() + elif isinstance(module, nn.LayerNorm): + module.bias.data.zero_() + module.weight.data.fill_(1.0) + def init_optimizers(self,train_config): + optimizer = torch.optim.Adam(self.parameters(), lr=train_config[varables.RATE_LEARNING]) + return optimizer + def init_scheduler(self,train_config): + scheduler = torch.optim.lr_scheduler.StepLR(self.optimizer, step_size=train_config[varables.SIZE_STEP], gamma=train_config[varables.GAMMA]) + return scheduler + def get_collate_fn(self, vocab_encoder,vocab_decoder): + def collate(results): + x_in = [a[0] for a in results] + y_in = [a[1] for a in results] + boundary = -1 + max_len_x = max([len(a) for a in x_in]) + max_len_y = max([len(a) for a in y_in]) + x = torch.tensor([(a+[vocab_encoder[varables.TOKEN_PAD]]*(max_len_x-len(a))) for a in x_in],dtype=torch.long) + y = torch.tensor([(a+[vocab_decoder[varables.TOKEN_PAD]]*(max_len_y-len(a))) for a in y_in],dtype=torch.long) + return x,y,boundary + return collate + def forward(self, src, trg, trg_out, boundary=None): + src_mask = None + trg_mask = torch.tril(torch.ones(trg.shape[1], trg.shape[1])).view(1, 1, trg.shape[1], trg.shape[1]).to(trg.device) + e_outputs = self.encoder(src, src_mask) + d_output = self.decoder(trg, e_outputs, src_mask, trg_mask) + logits = self.out(d_output) + loss = None + if trg_out is not None: + loss = F.cross_entropy(logits.view(-1, logits.size(-1)), trg_out.view(-1)) + return logits, loss + +# mark test \ No newline at end of file diff --git a/SCMG/models/Transformer_debug/model copy.py b/SCMG/models/Transformer_debug/model copy.py new file mode 100644 index 0000000000000000000000000000000000000000..85ed98da342e63696371099158471e07cd1bf25c --- /dev/null +++ b/SCMG/models/Transformer_debug/model copy.py @@ -0,0 +1,187 @@ +import math +import logging + +import torch +import torch.nn as nn +from torch.nn import functional as F + +logger = logging.getLogger(__name__) +from SCMG.config import varables + +# class ModelConfig(): +# 
rate_dropout_embedding = 0.1 +# rate_dropout_residue = 0.1 +# rate_dropout_attention = 0.1 +# block_size=125 +# def __init__(self, size_vocab, **kwargs): +# self.size_vocab = size_vocab +# for k,v in kwargs.items(): +# setattr(self, k, v) + +class CausalSelfAttention(nn.Module): + def __init__(self, config): + super().__init__() + assert config[varables.DIM_ATTENTION] % config[varables.NUM_HEADS] == 0 + self.key = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION]) + self.query = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION]) + self.value = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION]) + self.dropout_attention = nn.Dropout(config[varables.RATE_DROPOUT]) + self.dropout_residue = nn.Dropout(config[varables.RATE_DROPOUT]) + self.projection = nn.Linear(config[varables.DIM_ATTENTION], config[varables.DIM_EMBEDDING]) + self.register_buffer("mask", torch.tril(torch.ones(config[varables.SIZE_BLOCK], config[varables.SIZE_BLOCK])) + .view(1, 1, config[varables.SIZE_BLOCK], config[varables.SIZE_BLOCK])) + self.n_head = config[varables.NUM_HEADS] + self.single_head_dim = config[varables.DIM_ATTENTION] // self.n_head + self.attention_features = config[varables.DIM_ATTENTION] + + def forward(self, x, layer_past=None): + B, T, C = x.size() + k = self.key(x).view(B, T, self.n_head,self.single_head_dim).transpose(1, 2) + q = self.query(x).view(B, T, self.n_head,self.single_head_dim).transpose(1, 2) + v = self.value(x).view(B, T, self.n_head,self.single_head_dim).transpose(1, 2) + att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1))) + att = att.masked_fill(self.mask[:,:,:T,:T] == 0, float('-inf')) + att = F.softmax(att, dim=-1) + att = self.dropout_attention(att) + y = att @ v + y = y.transpose(1, 2).contiguous().view(B, T, self.attention_features) + y = self.dropout_residue(self.projection(y)) + return y + + +class CrossAttention(nn.Module): + def __init__(self, config): + super().__init__() + 
assert config[varables.DIM_ATTENTION] % config[varables.NUM_HEADS] == 0 + self.key = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION]) + self.query = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION]) + self.value = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION]) + self.dropout_attention = nn.Dropout(config[varables.RATE_DROPOUT]) + self.dropout_residue = nn.Dropout(config[varables.RATE_DROPOUT]) + self.projection = nn.Linear(config[varables.DIM_ATTENTION], config[varables.DIM_EMBEDDING]) + self.n_head = config[varables.NUM_HEADS] + self.single_head_dim = config[varables.DIM_ATTENTION] // self.n_head + self.attention_features = config[varables.DIM_ATTENTION] + self.register_buffer("mask", torch.tril(torch.ones(config[varables.SIZE_BLOCK], config[varables.SIZE_BLOCK])) + .view(1, 1, config[varables.SIZE_BLOCK], config[varables.SIZE_BLOCK])) + + def forward(self, x_encoder,x_decoder, layer_past=None): + B_encoder, T_encoder, C_encoder = x_encoder.size() + B_decoder, T_decoder, C_decoder = x_decoder.size() + k = self.key( x_encoder).view(B_encoder, T_encoder, self.n_head,self.single_head_dim).transpose(1, 2) + q = self.query(x_decoder).view(B_encoder, T_decoder, self.n_head,self.single_head_dim).transpose(1, 2) + v = self.value(x_encoder).view(B_encoder, T_encoder, self.n_head,self.single_head_dim).transpose(1, 2) + att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1))) + att = att.masked_fill(self.mask[:,:,:T_decoder,:T_encoder] == 0, float('-inf')) + att = F.softmax(att, dim=-1) + att = self.dropout_attention(att) + y = att @ v + y = y.transpose(1, 2).contiguous().view(B_encoder, T_decoder, self.attention_features) + y = self.dropout_residue(self.projection(y)) + return y + + + + +class EncoderBlock(nn.Module): + def __init__(self, config): + super().__init__() + self.ln1 = nn.LayerNorm(config[varables.DIM_EMBEDDING]) + self.ln2 = nn.LayerNorm(config[varables.DIM_EMBEDDING]) + 
self.attn = CausalSelfAttention(config) + self.mlp = nn.Sequential( + nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_FEEDFORWARD]), + nn.GELU(), + nn.Linear(config[varables.DIM_FEEDFORWARD], config[varables.DIM_EMBEDDING]), + nn.Dropout(config[varables.RATE_DROPOUT]), + ) + + def forward(self, x): + # = y_input + x = x + self.attn(self.ln1(x)) + x = x + self.mlp(self.ln2(x)) + return x + +class DecoderBlock(nn.Module): + def __init__(self, config): + super().__init__() + self.ln1 = nn.LayerNorm(config[varables.DIM_EMBEDDING]) + self.ln2 = nn.LayerNorm(config[varables.DIM_EMBEDDING]) + self.masked_attn = CausalSelfAttention(config) + self.cross_attn = CrossAttention(config) + self.mlp = nn.Sequential( + nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_FEEDFORWARD]), + nn.GELU(), + nn.Linear(config[varables.DIM_FEEDFORWARD], config[varables.DIM_EMBEDDING]), + nn.Dropout(config[varables.RATE_DROPOUT]), + ) + + def forward(self, x_encoder,x): + # = y_input + x = x + self.masked_attn(self.ln1(x)) + x = x + self.cross_attn(x_encoder,self.ln1(x)) + x = x + self.mlp(self.ln2(x)) + return x + +class Model(nn.Module): + def __init__(self, config): + super().__init__() + self.tok_emb = nn.Embedding(config[varables.SIZE_VOCAB], config[varables.DIM_EMBEDDING]) + self.pos_emb = nn.Parameter(torch.zeros(1, config[varables.SIZE_BLOCK], config[varables.DIM_EMBEDDING])) + self.drop = nn.Dropout(config[varables.RATE_DROPOUT]) + self.encoder_blocks = nn.ModuleList([EncoderBlock(config) for _ in range(config[varables.NUM_LAYERS])]) + self.decoder_blocks = nn.ModuleList([DecoderBlock(config) for _ in range(config[varables.NUM_LAYERS])]) + # self.blocks = nn.Sequential(*[DecoderBlock(config) for _ in range(config[varables.NUM_LAYERS])]) + self.ln_f = nn.LayerNorm(config[varables.DIM_EMBEDDING]) + self.head = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.SIZE_VOCAB], bias=False) + self.block_size = config[varables.SIZE_BLOCK] + 
self.apply(self._init_weights) + logger.info("number of parameters: %e", sum(p.numel() for p in self.parameters())) + self.optimizer = None + + def get_block_size(self): + return self.block_size + + def _init_weights(self, module): + if isinstance(module, (nn.Linear, nn.Embedding)): + module.weight.data.normal_(mean=0.0, std=0.02) + if isinstance(module, nn.Linear) and module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.LayerNorm): + module.bias.data.zero_() + module.weight.data.fill_(1.0) + def init_optimizers(self,train_config): + optimizer = torch.optim.Adam(self.parameters(), lr=train_config[varables.RATE_LEARNING]) + return optimizer + def init_scheduler(self,train_config): + scheduler = torch.optim.lr_scheduler.StepLR(self.optimizer, step_size=train_config[varables.SIZE_STEP], gamma=train_config[varables.GAMMA]) + return scheduler + def get_collate_fn(self, vocab): + def collate(results): + x_in = [a[0] for a in results] + y_in = [a[1] for a in results] + boundary = -1 + max_len_x = max([len(a) for a in x_in]) + max_len_y = max([len(a) for a in y_in]) + x = torch.tensor([(a+[vocab[varables.TOKEN_PAD]]*(max_len_x-len(a))) for a in x_in],dtype=torch.long) + y = torch.tensor([(a+[vocab[varables.TOKEN_PAD]]*(max_len_y-len(a))) for a in y_in],dtype=torch.long) + return x,y,boundary + return collate + + def forward(self, x_in, y_in, y_out=None,boundary=None): + x_in = self.drop(self.tok_emb(x_in) + self.pos_emb[:, :x_in.size()[1], :]) + y_in = self.drop(self.tok_emb(y_in) + self.pos_emb[:, :y_in.size()[1], :]) + # + for encoder_block in self.encoder_blocks: + x_in = encoder_block(x_in) + x_in = self.ln_f(x_in) + for decoder_block in self.decoder_blocks: + y_in = decoder_block(x_in,y_in) + y_in = self.ln_f(y_in) + logits = self.head(y_in) + loss = None + if y_out is not None: + loss = F.cross_entropy(logits.view(-1, logits.size(-1)), y_out.view(-1)) + return logits, loss + +# mark test \ No newline at end of file diff --git 
a/SCMG/models/Transformer_debug/model.py b/SCMG/models/Transformer_debug/model.py new file mode 100644 index 0000000000000000000000000000000000000000..b7845521bf9f5f226fc10a47720fef0e9b6d7cd8 --- /dev/null +++ b/SCMG/models/Transformer_debug/model.py @@ -0,0 +1,275 @@ +import math +import logging + +import torch +import torch.nn as nn +from torch.nn import functional as F + +# logger = logging.getLogger(__name__) +from SCMG.config import varables +from torch.autograd import Variable + +# class PositionalEncoder(nn.Module): +# def __init__(self, config): +# super().__init__() +# pe = torch.zeros(config[varables.SIZE_BLOCK], config[varables.DIM_ATTENTION]) +# for pos in range(config[varables.SIZE_BLOCK]): +# for i in range(0, config[varables.DIM_ATTENTION], 2): +# pe[pos, i] = \ +# math.sin(pos / (10000 ** ((2 * i)/config[varables.DIM_ATTENTION]))) +# pe[pos, i + 1] = \ +# math.cos(pos / (10000 ** ((2 * (i + 1))/config[varables.DIM_ATTENTION]))) +# pe = pe.unsqueeze(0) +# self.register_buffer('pe', pe) +# def forward(self, T): +# #add constant to embedding +# x = Variable(self.pe[:,:T], requires_grad=False) +# return x + + + +class PositionalEncoder(nn.Module): + def __init__(self, config): + super(PositionalEncoder, self).__init__() + self.Dropout = nn.Dropout(p=config[varables.RATE_DROPOUT]) + max_len = config[varables.SIZE_BLOCK] + pe = torch.zeros(max_len, config[varables.DIM_ATTENTION]) + position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1) + div_term = torch.exp(torch.arange(0, config[varables.DIM_ATTENTION], 2).float() * (-math.log(10000.0) / config[varables.DIM_ATTENTION])) + pe[:, 0::2] = torch.sin(position * div_term) + pe[:, 1::2] = torch.cos(position * div_term) + pe = pe.unsqueeze(0) + self.register_buffer('pe', pe) + def forward(self, T): + x = self.Dropout(self.pe[:,:T, :]) + return x + + + +class Attention(nn.Module): + def __init__(self, config): + super().__init__() + assert config[varables.DIM_ATTENTION] % config[varables.NUM_HEADS] 
== 0 + self.Key = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION]) + self.Query = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION]) + self.Value = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION]) + self.Dropout_Attention = nn.Dropout(config[varables.RATE_DROPOUT]) + self.Dropout_Residue = nn.Dropout(config[varables.RATE_DROPOUT]) + self.Projection = nn.Linear(config[varables.DIM_ATTENTION], config[varables.DIM_EMBEDDING]) + self.NumberOfHeads = config[varables.NUM_HEADS] + self.DimHead = config[varables.DIM_ATTENTION] // self.NumberOfHeads + self.DimAttention = config[varables.DIM_ATTENTION] + + def forward(self, X_1,X_2, mask=None): + BatchSize, T_Encoder, _ = X_1.size() + BatchSize, T_Decoder, _ = X_2.size() + K = self.Key( X_1).view(BatchSize, T_Encoder, self.NumberOfHeads,self.DimHead).transpose(1, 2) + Q = self.Query(X_2).view(BatchSize, T_Decoder, self.NumberOfHeads,self.DimHead).transpose(1, 2) + V = self.Value(X_1).view(BatchSize, T_Encoder, self.NumberOfHeads,self.DimHead).transpose(1, 2) + # k,q,v dimension: (BatchSize, SequenceSize, NumberOfHeads, HeadDimension) 3,4,5,16 + ScoreAttention = (Q @ K.transpose(-2, -1)) / math.sqrt(self.DimHead) + ScoreAttention = ScoreAttention.masked_fill(mask==0, -1e9) + ScoreAttention = F.softmax(ScoreAttention, dim=-1) + ScoreAttention = self.Dropout_Attention(ScoreAttention) + # k.transpose(-2,-1): 3,4,16,5 + # (q@(k.transpose(-2,-1))): 3,4,5,5 + Z = ScoreAttention @ V + # y dimension: 3,4,5,16 + Z = Z.transpose(1, 2).contiguous().view(BatchSize, T_Decoder, self.DimAttention) + # y dimension: 3,5,64 + Z = self.Dropout_Residue(self.Projection(Z)) + return Z + + + + + + + + + + +class FeedForward(nn.Module): + def __init__(self, config): + super().__init__() + if config[varables.DIM_FEEDFORWARD] == 0: + Dim_FeedForward = config[varables.DIM_ATTENTION] *4 + else: + Dim_FeedForward = config[varables.DIM_FEEDFORWARD] + self.Linear1 = 
nn.Linear(config[varables.DIM_EMBEDDING], Dim_FeedForward) + self.GELU = nn.GELU() + self.Linear2 = nn.Linear(Dim_FeedForward, config[varables.DIM_EMBEDDING]) + self.Dropout = nn.Dropout(config[varables.RATE_DROPOUT]) + + def forward(self,x): + x = self.Linear1(x) + x = self.GELU (x) + x = self.Dropout(x) + x = self.Linear2(x) + return x + + + + +class EncoderBlock(nn.Module): + def __init__(self, config): + super().__init__() + self.LayerNorm1 = nn.LayerNorm(config[varables.DIM_EMBEDDING]) + self.LayerNorm2 = nn.LayerNorm(config[varables.DIM_EMBEDDING]) + self.Dropout1 = nn.Dropout(config[varables.RATE_DROPOUT]) + self.Dropout2 = nn.Dropout(config[varables.RATE_DROPOUT]) + self.Attention = Attention( config) + self.FeedForward = FeedForward(config) + + def forward(self, X_Encoder,Mask_Encoder): + X_Encoder = self.LayerNorm1(X_Encoder + self.Dropout1(self.Attention( X_Encoder, X_Encoder, Mask_Encoder))) + X_Encoder = self.LayerNorm2(X_Encoder + self.Dropout2(self.FeedForward(X_Encoder))) + return X_Encoder + +class DecoderBlock(nn.Module): + def __init__(self, config): + super().__init__() + self.LayerNorm1 = nn.LayerNorm(config[varables.DIM_EMBEDDING]) + self.LayerNorm2 = nn.LayerNorm(config[varables.DIM_EMBEDDING]) + self.LayerNorm3 = nn.LayerNorm(config[varables.DIM_EMBEDDING]) + self.Dropout1 = nn.Dropout(config[varables.RATE_DROPOUT]) + self.Dropout2 = nn.Dropout(config[varables.RATE_DROPOUT]) + self.Dropout3 = nn.Dropout(config[varables.RATE_DROPOUT]) + self.AttentionMasked = Attention( config) + self.AttentionCross = Attention( config) + self.FeedForward = FeedForward(config) + + def forward(self, X_Encoder,X_Decoder,Mask_Cross,Mask_Decoder): + X_Decoder = self.LayerNorm1(X_Decoder + self.Dropout1(self.AttentionMasked(X_Decoder, X_Decoder, Mask_Decoder))) + X_Decoder = self.LayerNorm2(X_Decoder + self.Dropout2(self.AttentionCross (X_Encoder, X_Decoder, Mask_Cross))) + X_Decoder = self.LayerNorm3(X_Decoder + self.Dropout3(self.FeedForward( X_Decoder ))) + 
return X_Decoder + + + + + + + + + + + + + + + + + + +class Model(nn.Module): + def __init__(self, config): + super().__init__() + # Varables + self.Dim_Attention = config[varables.DIM_ATTENTION] + self.Token_Padding_Encoder = config["Token_Padding_Encoder"] + self.Token_Padding_Decoder = config["Token_Padding_Decoder"] + # Embedding and positional encoding layers + self.Embedding_Encoder = nn.Embedding(len(config["vocab_encoder"]), config[varables.DIM_ATTENTION]) + self.Embedding_Decoder = nn.Embedding(len(config["vocab_decoder"]), config[varables.DIM_ATTENTION]) + self.pos_emb = PositionalEncoder(config) + # Dropout and normalization layers + self.Dropout1 = nn.Dropout(config[varables.RATE_DROPOUT]) + self.Dropout2 = nn.Dropout(config[varables.RATE_DROPOUT]) + self.LayerNorm1 = nn.LayerNorm(config[varables.DIM_EMBEDDING]) + self.LayerNorm2 = nn.LayerNorm(config[varables.DIM_EMBEDDING]) + # Transformer layers + self.encoder_blocks = nn.ModuleList([EncoderBlock(config) for _ in range(config[varables.NUM_LAYERS])]) + self.decoder_blocks = nn.ModuleList([DecoderBlock(config) for _ in range(config[varables.NUM_LAYERS])]) + # Output layer + self.head = nn.Linear(config[varables.DIM_ATTENTION], len(config["vocab_decoder"]), bias=False) + # Init + self.apply(self._init_weights) + self.optimizer = None + # logger.info("number of parameters: %e", sum(p.numel() for p in self.parameters())) + + def _init_weights(self, module): + for p in module.parameters(): + if p.dim() > 1: + nn.init.xavier_uniform_(p) + # if isinstance(module, (nn.Linear, nn.Embedding)): + # module.weight.data.normal_(mean=0.0, std=0.02) + # if isinstance(module, nn.Linear) and module.bias is not None: + # module.bias.data.zero_() + # elif isinstance(module, nn.LayerNorm): + # module.bias.data.zero_() + # module.weight.data.fill_(1.0) + def init_optimizers(self,train_config): + optimizer = torch.optim.Adam(self.parameters(), lr=train_config[varables.RATE_LEARNING]) + return optimizer + def 
init_scheduler(self,train_config): + scheduler = torch.optim.lr_scheduler.StepLR(self.optimizer, step_size=train_config[varables.SIZE_STEP], gamma=train_config[varables.GAMMA]) + return scheduler + def get_collate_fn(self, vocab_encoder,vocab_decoder): + def collate(results): + X_Encoder = [a[0] for a in results] + X_Decoder = [a[1] for a in results] + boundary = -1 + max_len_x = max([len(a) for a in X_Encoder]) + max_len_y = max([len(a) for a in X_Decoder]) + x = torch.tensor([(a+[vocab_encoder[varables.TOKEN_PAD]]*(max_len_x-len(a))) for a in X_Encoder],dtype=torch.long) + y = torch.tensor([(a+[vocab_decoder[varables.TOKEN_PAD]]*(max_len_y-len(a))) for a in X_Decoder],dtype=torch.long) + return x,y,boundary + return collate + + + def generate_masks(self,X_Encoder, X_Decoder): + # Generate encoder, decoder, cross masks + T = X_Decoder.shape[1] + Mask_Encoder = (X_Encoder != self.Token_Padding_Encoder).unsqueeze(-2).unsqueeze(-2) + Mask_Decoder = (X_Decoder != self.Token_Padding_Decoder).unsqueeze(-2).unsqueeze(-2).repeat(1,1,T,1) + Mask_Cross = (X_Encoder != self.Token_Padding_Encoder).unsqueeze(-2).unsqueeze(-2) + mask_tril = torch.tril(torch.ones(T, T)).view(1, 1, T, T).to(Mask_Decoder.device) + Mask_Decoder = Mask_Decoder.masked_fill(mask_tril==0,0) + return Mask_Encoder,Mask_Decoder,Mask_Cross + + def forward(self, X_Encoder, X_Decoder, Y_Decoder_Ref=None,boundary=None): + Mask_Encoder, Mask_Decoder,Mask_Cross = self.generate_masks(X_Encoder, X_Decoder) + # preprocess + X_Encoder = self.Dropout1(self.Embedding_Encoder(X_Encoder) * math.sqrt(self.Dim_Attention) + self.pos_emb(X_Encoder.size(1))) + X_Decoder = self.Dropout2(self.Embedding_Decoder(X_Decoder) * math.sqrt(self.Dim_Attention) + self.pos_emb(X_Decoder.size(1))) + #### Now X_Encoder: BatchSize, SequenceLength, DimAttention + # Encoder blocks + X_Encoder = self.LayerNorm1(X_Encoder) + for encoder_block in self.encoder_blocks: + X_Encoder = encoder_block(X_Encoder,Mask_Encoder) + # Decoder blocks + 
X_Decoder = self.LayerNorm2(X_Decoder) + for decoder_block in self.decoder_blocks: + X_Decoder = decoder_block(X_Encoder,X_Decoder,Mask_Cross,Mask_Decoder) + Y_Decoder_Logits = self.head(X_Decoder) + loss = None + if Y_Decoder_Ref is not None: + loss = F.cross_entropy(Y_Decoder_Logits.view(-1, Y_Decoder_Logits.size(-1)), Y_Decoder_Ref.view(-1),ignore_index=self.Token_Padding_Decoder) + return Y_Decoder_Logits, loss + + + + + + + + + + + + + # def generate_masks(self,X_Encoder, X_Decoder): + # # Generate encoder, decoder, cross masks + # Mask_Encoder = (X_Encoder != self.Token_Padding_Encoder).unsqueeze(-2).int().cpu() + # Mask_Decoder = (X_Decoder != self.Token_Padding_Decoder).unsqueeze(-2).int().cpu() + # Mask_Cross = Mask_Decoder.unsqueeze(-1) @ Mask_Encoder.unsqueeze(-2) + # Mask_Encoder = Mask_Encoder.unsqueeze(-1) @ Mask_Encoder.unsqueeze(-2) + # Mask_Decoder = Mask_Decoder.unsqueeze(-1) @ Mask_Decoder.unsqueeze(-2) + # T = X_Decoder.shape[1] + # mask_tril = torch.tril(torch.ones(T, T)).view(1, 1, T, T) + # Mask_Decoder = Mask_Decoder.masked_fill(mask_tril==0,0) + # Mask_Encoder = Mask_Encoder.to(X_Encoder.device) + # Mask_Decoder = Mask_Decoder.to(X_Decoder.device) + # Mask_Cross = Mask_Cross.to(X_Encoder.device) + # return Mask_Encoder,Mask_Decoder,Mask_Cross diff --git a/SCMG/models/Transformer_debug/sampler.py b/SCMG/models/Transformer_debug/sampler.py new file mode 100644 index 0000000000000000000000000000000000000000..1c606302447534d425b50bbd15c153ea79895b65 --- /dev/null +++ b/SCMG/models/Transformer_debug/sampler.py @@ -0,0 +1,85 @@ +import random +import numpy as np +import torch +import torch.nn as nn +from torch.nn import functional as F + +def set_seed(seed): + random.seed(seed) + np.random.seed(seed) + torch.manual_seed(seed) + torch.cuda.manual_seed_all(seed) + +def top_k_logits(logits, k): + v, ix = torch.topk(logits, k) + out = logits.clone() + out[out < v[:, [-1]]] = -float('Inf') + return out + +@torch.no_grad() +def sample(model, x, steps, 
temperature=1.0, sample=False, top_k=None): + block_size = model.get_block_size() + model.eval() + for k in range(steps): + x_cond = x if x.size(1) <= block_size else x[:, -block_size:] + logits, _ = model(x_cond) + logits = logits[:, -1, :] / temperature + if top_k is not None: + logits = top_k_logits(logits, top_k) + probs = F.softmax(logits, dim=-1) + if sample: + ix = torch.multinomial(probs, num_samples=1) + else: + _, ix = torch.topk(probs, k=1, dim=-1) + x = torch.cat((x, ix), dim=1) + + return x + + + + +@torch.no_grad() +def sample(model, x, steps, temperature=1.0,boundary=None): + block_size = model.get_block_size() + model.eval() + for k in range(steps): + x_cond = x if x.size(1) <= block_size else x[:, -block_size:] + logits, _ = model(x_cond,boundary=boundary) + logits = logits[:, -1, :] / temperature + probs = F.softmax(logits, dim=-1) + ix = torch.multinomial(probs, num_samples=1) + x = torch.cat((x, ix), dim=1) + return x + +'L_5*C(=O)NCc1cccc(OC)c1.*c1nsc2ccccc12COc1cccc(CNC(=O)c2cccc(NC(=O)c3nsc4ccccc34)c2)c1' + +# for i in range(1,21): +def sample_L(i,option='string'): + # i=2 + prefix = 'L_'+str(i) + string_input = prefix + '*O=C1NN=Cc2c1cccc2.*O=C(C1CC1)N1CCNCC1' + array_input = [vocab[a] for a in [''] + list(string_input)] + boundary = [len(array_input)] + tensor_input = torch.tensor(array_input,device='cuda').unsqueeze(0).repeat(32,1) + boundary = boundary*32 + tensor_output = sample(model,tensor_input,250,boundary=boundary) + strings_output = [] + for j in range(tensor_output.shape[0]): + list_string_output = [inv[a] for a in tensor_output[j,boundary[j]:].cpu().numpy() if a != vocab['']] + # if list_string_output[0] == '': + # list_string_output = list_string_output[1:] + if list_string_output[-1] == '': + list_string_output = list_string_output[:-1] + string_output = ''.join(list_string_output) + strings_output.append(string_output) + print(string_output) + for j in range(tensor_output.shape[0]): + if test_valid(strings_output[j]): + 
print(1) + else: + print(0) + + # logits,_ = model(tensor_input,boundary=boundary) + + +['', 'L', '_', '5', '*', 'C', '(', '=', 'O', ')', 'N', 'C', 'c', '1', 'c', 'c', 'c', 'c', '(', 'O', 'C', ')', 'c', '1', '.', '*', 'c', '1', 'n', 's', 'c', '2', 'c', 'c', 'c', 'c', 'c', '1', '2', 'C', 'O', 'c', '1', 'c', 'c', 'c', 'c', '(', 'C', 'N', 'C', '(', '=', 'O', ')', 'c', '2', 'c', 'c', 'c', 'c', '(', 'N', 'C', '(', '=', 'O', ')', 'c', '3', 'n', 's', 'c', '4', 'c', 'c', 'c', 'c', 'c', '3', '4', ')', 'c', '2', ')', 'c', '1', ''] diff --git a/SCMG/models/Transformer_debug2 copy/__init__.py b/SCMG/models/Transformer_debug2 copy/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/SCMG/models/Transformer_debug2 copy/__pycache__/__init__.cpython-310.pyc b/SCMG/models/Transformer_debug2 copy/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a26e17bc9adfde2f235fb28f005491923c2839ed Binary files /dev/null and b/SCMG/models/Transformer_debug2 copy/__pycache__/__init__.cpython-310.pyc differ diff --git a/SCMG/models/Transformer_debug2 copy/__pycache__/model copy 2.cpython-310.pyc b/SCMG/models/Transformer_debug2 copy/__pycache__/model copy 2.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..85ce1889aed10a33dcfafb5baac060711904bb26 Binary files /dev/null and b/SCMG/models/Transformer_debug2 copy/__pycache__/model copy 2.cpython-310.pyc differ diff --git a/SCMG/models/Transformer_debug2 copy/__pycache__/model copy.cpython-310.pyc b/SCMG/models/Transformer_debug2 copy/__pycache__/model copy.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f368fbe64e9c3d744c1185c7bd01619da5448da0 Binary files /dev/null and b/SCMG/models/Transformer_debug2 copy/__pycache__/model copy.cpython-310.pyc differ diff --git a/SCMG/models/Transformer_debug2 copy/__pycache__/model.cpython-310.pyc 
b/SCMG/models/Transformer_debug2 copy/__pycache__/model.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ce98387b4d198842c3e809383a268f4ab9e99b58 Binary files /dev/null and b/SCMG/models/Transformer_debug2 copy/__pycache__/model.cpython-310.pyc differ diff --git a/SCMG/models/Transformer_debug2 copy/__pycache__/sampler.cpython-310.pyc b/SCMG/models/Transformer_debug2 copy/__pycache__/sampler.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..64918d3bb932c62cfeb87135b069381c5fac1f5e Binary files /dev/null and b/SCMG/models/Transformer_debug2 copy/__pycache__/sampler.cpython-310.pyc differ diff --git a/SCMG/models/Transformer_debug2 copy/model copy 2.py b/SCMG/models/Transformer_debug2 copy/model copy 2.py new file mode 100644 index 0000000000000000000000000000000000000000..8b254fbe02eafd3f7370cfae17eb6729563aa260 --- /dev/null +++ b/SCMG/models/Transformer_debug2 copy/model copy 2.py @@ -0,0 +1,420 @@ +import math +import logging + +import torch +import torch.nn as nn +from torch.nn import functional as F + +logger = logging.getLogger(__name__) +from SCMG.config import varables + +# class ModelConfig(): +# rate_dropout_embedding = 0.1 +# rate_dropout_residue = 0.1 +# rate_dropout_attention = 0.1 +# block_size=125 +# def __init__(self, size_vocab, **kwargs): +# self.size_vocab = size_vocab +# for k,v in kwargs.items(): +# setattr(self, k, v) + +class CausalSelfAttention(nn.Module): + def __init__(self, config): + super().__init__() + assert config[varables.DIM_ATTENTION] % config[varables.NUM_HEADS] == 0 + self.key = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION]) + self.query = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION]) + self.value = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION]) + self.dropout_attention = nn.Dropout(config[varables.RATE_DROPOUT]) + self.dropout_residue = nn.Dropout(config[varables.RATE_DROPOUT]) 
+ self.projection = nn.Linear(config[varables.DIM_ATTENTION], config[varables.DIM_EMBEDDING]) + self.register_buffer("mask", torch.tril(torch.ones(config[varables.SIZE_BLOCK], config[varables.SIZE_BLOCK])) + .view(1, 1, config[varables.SIZE_BLOCK], config[varables.SIZE_BLOCK])) + self.n_head = config[varables.NUM_HEADS] + self.single_head_dim = config[varables.DIM_ATTENTION] // self.n_head + self.attention_features = config[varables.DIM_ATTENTION] + + def forward(self, x, layer_past=None): + B, T, C = x.size() + k = self.key(x).view(B, T, self.n_head,self.single_head_dim).transpose(1, 2) + q = self.query(x).view(B, T, self.n_head,self.single_head_dim).transpose(1, 2) + v = self.value(x).view(B, T, self.n_head,self.single_head_dim).transpose(1, 2) + att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1))) + att = att.masked_fill(self.mask[:,:,:T,:T] == 0, float('-inf')) + att = F.softmax(att, dim=-1) + att = self.dropout_attention(att) + y = att @ v + y = y.transpose(1, 2).contiguous().view(B, T, self.attention_features) + y = self.dropout_residue(self.projection(y)) + return y + + +class CrossAttention(nn.Module): + def __init__(self, config): + super().__init__() + assert config[varables.DIM_ATTENTION] % config[varables.NUM_HEADS] == 0 + self.key = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION]) + self.query = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION]) + self.value = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION]) + self.dropout_attention = nn.Dropout(config[varables.RATE_DROPOUT]) + self.dropout_residue = nn.Dropout(config[varables.RATE_DROPOUT]) + self.projection = nn.Linear(config[varables.DIM_ATTENTION], config[varables.DIM_EMBEDDING]) + self.n_head = config[varables.NUM_HEADS] + self.single_head_dim = config[varables.DIM_ATTENTION] // self.n_head + self.attention_features = config[varables.DIM_ATTENTION] + self.register_buffer("mask", 
torch.tril(torch.ones(config[varables.SIZE_BLOCK], config[varables.SIZE_BLOCK])) + .view(1, 1, config[varables.SIZE_BLOCK], config[varables.SIZE_BLOCK])) + + def forward(self, x_encoder,x_decoder, layer_past=None): + B_encoder, T_encoder, C_encoder = x_encoder.size() + B_decoder, T_decoder, C_decoder = x_decoder.size() + k = self.key( x_encoder).view(B_encoder, T_encoder, self.n_head,self.single_head_dim).transpose(1, 2) + q = self.query(x_decoder).view(B_encoder, T_decoder, self.n_head,self.single_head_dim).transpose(1, 2) + v = self.value(x_encoder).view(B_encoder, T_encoder, self.n_head,self.single_head_dim).transpose(1, 2) + att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1))) + att = att.masked_fill(self.mask[:,:,:T_decoder,:T_encoder] == 0, float('-inf')) + att = F.softmax(att, dim=-1) + att = self.dropout_attention(att) + y = att @ v + y = y.transpose(1, 2).contiguous().view(B_encoder, T_decoder, self.attention_features) + y = self.dropout_residue(self.projection(y)) + return y + + + + +class EncoderBlock(nn.Module): + def __init__(self, config): + super().__init__() + self.ln1 = nn.LayerNorm(config[varables.DIM_EMBEDDING]) + self.ln2 = nn.LayerNorm(config[varables.DIM_EMBEDDING]) + self.attn = CausalSelfAttention(config) + self.mlp = nn.Sequential( + nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_FEEDFORWARD]), + nn.GELU(), + nn.Linear(config[varables.DIM_FEEDFORWARD], config[varables.DIM_EMBEDDING]), + nn.Dropout(config[varables.RATE_DROPOUT]), + ) + + def forward(self, x): + # = y_input + x = x + self.attn(self.ln1(x)) + x = x + self.mlp(self.ln2(x)) + return x + +class DecoderBlock(nn.Module): + def __init__(self, config): + super().__init__() + self.ln1 = nn.LayerNorm(config[varables.DIM_EMBEDDING]) + self.ln2 = nn.LayerNorm(config[varables.DIM_EMBEDDING]) + self.masked_attn = CausalSelfAttention(config) + self.cross_attn = CrossAttention(config) + self.mlp = nn.Sequential( + nn.Linear(config[varables.DIM_EMBEDDING], 
config[varables.DIM_FEEDFORWARD]), + nn.GELU(), + nn.Linear(config[varables.DIM_FEEDFORWARD], config[varables.DIM_EMBEDDING]), + nn.Dropout(config[varables.RATE_DROPOUT]), + ) + + def forward(self, x_encoder,x): + # = y_input + x = x + self.masked_attn(self.ln1(x)) + x = x + self.cross_attn(x_encoder,self.ln1(x)) + x = x + self.mlp(self.ln2(x)) + return x + + + + + + + + + + + + + + + + + +import torch +import torch.nn as nn +import torch.nn.functional as F +import math + + +class Norm(nn.Module): + def __init__(self, d_model, eps = 1e-6): + super().__init__() + + self.size = d_model + + # create two learnable parameters to calibrate normalisation + self.alpha = nn.Parameter(torch.ones(self.size)) + self.bias = nn.Parameter(torch.zeros(self.size)) + + self.eps = eps + + def forward(self, x): + norm = self.alpha * (x - x.mean(dim=-1, keepdim=True)) \ + / (x.std(dim=-1, keepdim=True) + self.eps) + self.bias + return norm + +def attention(q, k, v, d_k, mask=None, dropout=None): + + scores = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(d_k) + + if mask is not None: + mask = mask.unsqueeze(1) + scores = scores.masked_fill(mask == 0, -1e9) + + scores = F.softmax(scores, dim=-1) + + if dropout is not None: + scores = dropout(scores) + + output = torch.matmul(scores, v) + return output + +class MultiHeadAttention(nn.Module): + def __init__(self, heads, d_model, dropout = 0.1): + super().__init__() + + self.d_model = d_model + self.d_k = d_model // heads + self.h = heads + + self.q_linear = nn.Linear(d_model, d_model) + self.v_linear = nn.Linear(d_model, d_model) + self.k_linear = nn.Linear(d_model, d_model) + + self.dropout = nn.Dropout(dropout) + self.out = nn.Linear(d_model, d_model) + + def forward(self, q, k, v, mask=None): + + bs = q.size(0) + + # perform linear operation and split into N heads + k = self.k_linear(k).view(bs, -1, self.h, self.d_k) + q = self.q_linear(q).view(bs, -1, self.h, self.d_k) + v = self.v_linear(v).view(bs, -1, self.h, self.d_k) + + # 
transpose to get dimensions bs * N * sl * d_model + k = k.transpose(1,2) + q = q.transpose(1,2) + v = v.transpose(1,2) + + + # calculate attention using function we will define next + scores = attention(q, k, v, self.d_k, mask, self.dropout) + # concatenate heads and put through final linear layer + concat = scores.transpose(1,2).contiguous()\ + .view(bs, -1, self.d_model) + output = self.out(concat) + + return output + +class FeedForward(nn.Module): + def __init__(self, d_model, d_ff=2048, dropout = 0.1): + super().__init__() + + # We set d_ff as a default to 2048 + self.linear_1 = nn.Linear(d_model, d_ff) + self.dropout = nn.Dropout(dropout) + self.linear_2 = nn.Linear(d_ff, d_model) + + def forward(self, x): + x = self.dropout(F.relu(self.linear_1(x))) + x = self.linear_2(x) + return x + + + + +import torch +import torch.nn as nn +import copy + + +class EncoderLayer(nn.Module): + def __init__(self, d_model, heads, dropout=0.1): + super().__init__() + self.norm_1 = Norm(d_model) + self.norm_2 = Norm(d_model) + self.attn = MultiHeadAttention(heads, d_model, dropout=dropout) + self.ff = FeedForward(d_model, dropout=dropout) + self.dropout_1 = nn.Dropout(dropout) + self.dropout_2 = nn.Dropout(dropout) + + def forward(self, x, mask): + x2 = self.norm_1(x) + x = x + self.dropout_1(self.attn(x2,x2,x2,mask)) + x2 = self.norm_2(x) + x = x + self.dropout_2(self.ff(x2)) + return x + +# build a decoder layer with two multi-head attention layers and +# one feed-forward layer +class DecoderLayer(nn.Module): + def __init__(self, d_model, heads, dropout=0.1): + super().__init__() + self.norm_1 = Norm(d_model) + self.norm_2 = Norm(d_model) + self.norm_3 = Norm(d_model) + + self.dropout_1 = nn.Dropout(dropout) + self.dropout_2 = nn.Dropout(dropout) + self.dropout_3 = nn.Dropout(dropout) + + self.attn_1 = MultiHeadAttention(heads, d_model, dropout=dropout) + self.attn_2 = MultiHeadAttention(heads, d_model, dropout=dropout) + self.ff = FeedForward(d_model, dropout=dropout) + + def 
forward(self, x, e_outputs, src_mask, trg_mask): + x2 = self.norm_1(x) + x = x + self.dropout_1(self.attn_1(x2, x2, x2, trg_mask)) + x2 = self.norm_2(x) + x = x + self.dropout_2(self.attn_2(x2, e_outputs, e_outputs, \ + src_mask)) + x2 = self.norm_3(x) + x = x + self.dropout_3(self.ff(x2)) + return x + + +import torch +import torch.nn as nn +import math +from torch.autograd import Variable + +class Embedder(nn.Module): + def __init__(self, vocab_size, d_model): + super().__init__() + self.d_model = d_model + self.embed = nn.Embedding(vocab_size, d_model) + def forward(self, x): + return self.embed(x) + +class PositionalEncoder(nn.Module): + def __init__(self, d_model, max_seq_len = 200, dropout = 0.1): + super().__init__() + self.d_model = d_model + self.dropout = nn.Dropout(dropout) + # create constant 'pe' matrix with values dependant on + # pos and i + pe = torch.zeros(max_seq_len, d_model) + for pos in range(max_seq_len): + for i in range(0, d_model, 2): + pe[pos, i] = \ + math.sin(pos / (10000 ** ((2 * i)/d_model))) + pe[pos, i + 1] = \ + math.cos(pos / (10000 ** ((2 * (i + 1))/d_model))) + pe = pe.unsqueeze(0) + self.register_buffer('pe', pe) + + + def forward(self, x): + # make embeddings relatively larger + x = x * math.sqrt(self.d_model) + #add constant to embedding + seq_len = x.size(1) + pe = Variable(self.pe[:,:seq_len], requires_grad=False) + if x.is_cuda: + pe.cuda() + x = x + pe + return self.dropout(x) + +def get_clones(module, N): + return nn.ModuleList([copy.deepcopy(module) for i in range(N)]) + +class Encoder(nn.Module): + def __init__(self, vocab_size, d_model, N, heads, dropout): + super().__init__() + self.N = N + self.embed = Embedder(vocab_size, d_model) + self.pe = PositionalEncoder(d_model, dropout=dropout) + self.layers = get_clones(EncoderLayer(d_model, heads, dropout), N) + self.norm = Norm(d_model) + def forward(self, src, mask): + x = self.embed(src) + x = self.pe(x) + for i in range(self.N): + x = self.layers[i](x, mask) + return 
self.norm(x) + +class Decoder(nn.Module): + def __init__(self, vocab_size, d_model, N, heads, dropout): + super().__init__() + self.N = N + self.embed = Embedder(vocab_size, d_model) + self.pe = PositionalEncoder(d_model, dropout=dropout) + self.layers = get_clones(DecoderLayer(d_model, heads, dropout), N) + self.norm = Norm(d_model) + def forward(self, trg, e_outputs, src_mask, trg_mask): + x = self.embed(trg) + x = self.pe(x) + for i in range(self.N): + x = self.layers[i](x, e_outputs, src_mask, trg_mask) + return self.norm(x) + +class Model(nn.Module): + def __init__(self, config): + super().__init__() + self.encoder = Encoder(len(config["vocab_encoder"]), config[varables.DIM_ATTENTION], config[varables.NUM_LAYERS], config[varables.NUM_HEADS], config[varables.RATE_DROPOUT]) + self.decoder = Decoder(len(config["vocab_decoder"]), config[varables.DIM_ATTENTION], config[varables.NUM_LAYERS], config[varables.NUM_HEADS], config[varables.RATE_DROPOUT]) + self.out = nn.Linear(config[varables.DIM_ATTENTION], len(config["vocab_decoder"])) + # self.tok_emb = nn.Embedding(config[varables.SIZE_VOCAB], config[varables.DIM_EMBEDDING]) + # self.pos_emb = nn.Parameter(torch.zeros(1, config[varables.SIZE_BLOCK], config[varables.DIM_EMBEDDING])) + # self.drop = nn.Dropout(config[varables.RATE_DROPOUT]) + # self.encoder_blocks = nn.ModuleList([EncoderBlock(config) for _ in range(config[varables.NUM_LAYERS])]) + # self.decoder_blocks = nn.ModuleList([DecoderBlock(config) for _ in range(config[varables.NUM_LAYERS])]) + # self.blocks = nn.Sequential(*[DecoderBlock(config) for _ in range(config[varables.NUM_LAYERS])]) + # self.ln_f = nn.LayerNorm(config[varables.DIM_EMBEDDING]) + # self.head = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.SIZE_VOCAB], bias=False) + # self.block_size = config[varables.SIZE_BLOCK] + # self.apply(self._init_weights) + # logger.info("number of parameters: %e", sum(p.numel() for p in self.parameters())) + self.optimizer = None + + def 
get_block_size(self): + return self.block_size + + def _init_weights(self, module): + if isinstance(module, (nn.Linear, nn.Embedding)): + module.weight.data.normal_(mean=0.0, std=0.02) + if isinstance(module, nn.Linear) and module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.LayerNorm): + module.bias.data.zero_() + module.weight.data.fill_(1.0) + def init_optimizers(self,train_config): + optimizer = torch.optim.Adam(self.parameters(), lr=train_config[varables.RATE_LEARNING]) + return optimizer + def init_scheduler(self,train_config): + scheduler = torch.optim.lr_scheduler.StepLR(self.optimizer, step_size=train_config[varables.SIZE_STEP], gamma=train_config[varables.GAMMA]) + return scheduler + def get_collate_fn(self, vocab_encoder,vocab_decoder): + def collate(results): + x_in = [a[0] for a in results] + y_in = [a[1] for a in results] + boundary = -1 + max_len_x = max([len(a) for a in x_in]) + max_len_y = max([len(a) for a in y_in]) + x = torch.tensor([(a+[vocab_encoder[varables.TOKEN_PAD]]*(max_len_x-len(a))) for a in x_in],dtype=torch.long) + y = torch.tensor([(a+[vocab_decoder[varables.TOKEN_PAD]]*(max_len_y-len(a))) for a in y_in],dtype=torch.long) + return x,y,boundary + return collate + def forward(self, src, trg, trg_out, boundary=None): + src_mask = None + trg_mask = torch.tril(torch.ones(trg.shape[1], trg.shape[1])).view(1, 1, trg.shape[1], trg.shape[1]).to(trg.device) + e_outputs = self.encoder(src, src_mask) + d_output = self.decoder(trg, e_outputs, src_mask, trg_mask) + logits = self.out(d_output) + loss = None + if trg_out is not None: + loss = F.cross_entropy(logits.view(-1, logits.size(-1)), trg_out.view(-1)) + return logits, loss + +# mark test \ No newline at end of file diff --git a/SCMG/models/Transformer_debug2 copy/model copy.py b/SCMG/models/Transformer_debug2 copy/model copy.py new file mode 100644 index 0000000000000000000000000000000000000000..85ed98da342e63696371099158471e07cd1bf25c --- /dev/null +++ 
b/SCMG/models/Transformer_debug2 copy/model copy.py @@ -0,0 +1,187 @@ +import math +import logging + +import torch +import torch.nn as nn +from torch.nn import functional as F + +logger = logging.getLogger(__name__) +from SCMG.config import varables + +# class ModelConfig(): +# rate_dropout_embedding = 0.1 +# rate_dropout_residue = 0.1 +# rate_dropout_attention = 0.1 +# block_size=125 +# def __init__(self, size_vocab, **kwargs): +# self.size_vocab = size_vocab +# for k,v in kwargs.items(): +# setattr(self, k, v) + +class CausalSelfAttention(nn.Module): + def __init__(self, config): + super().__init__() + assert config[varables.DIM_ATTENTION] % config[varables.NUM_HEADS] == 0 + self.key = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION]) + self.query = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION]) + self.value = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION]) + self.dropout_attention = nn.Dropout(config[varables.RATE_DROPOUT]) + self.dropout_residue = nn.Dropout(config[varables.RATE_DROPOUT]) + self.projection = nn.Linear(config[varables.DIM_ATTENTION], config[varables.DIM_EMBEDDING]) + self.register_buffer("mask", torch.tril(torch.ones(config[varables.SIZE_BLOCK], config[varables.SIZE_BLOCK])) + .view(1, 1, config[varables.SIZE_BLOCK], config[varables.SIZE_BLOCK])) + self.n_head = config[varables.NUM_HEADS] + self.single_head_dim = config[varables.DIM_ATTENTION] // self.n_head + self.attention_features = config[varables.DIM_ATTENTION] + + def forward(self, x, layer_past=None): + B, T, C = x.size() + k = self.key(x).view(B, T, self.n_head,self.single_head_dim).transpose(1, 2) + q = self.query(x).view(B, T, self.n_head,self.single_head_dim).transpose(1, 2) + v = self.value(x).view(B, T, self.n_head,self.single_head_dim).transpose(1, 2) + att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1))) + att = att.masked_fill(self.mask[:,:,:T,:T] == 0, float('-inf')) + att = F.softmax(att, 
dim=-1) + att = self.dropout_attention(att) + y = att @ v + y = y.transpose(1, 2).contiguous().view(B, T, self.attention_features) + y = self.dropout_residue(self.projection(y)) + return y + + +class CrossAttention(nn.Module): + def __init__(self, config): + super().__init__() + assert config[varables.DIM_ATTENTION] % config[varables.NUM_HEADS] == 0 + self.key = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION]) + self.query = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION]) + self.value = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION]) + self.dropout_attention = nn.Dropout(config[varables.RATE_DROPOUT]) + self.dropout_residue = nn.Dropout(config[varables.RATE_DROPOUT]) + self.projection = nn.Linear(config[varables.DIM_ATTENTION], config[varables.DIM_EMBEDDING]) + self.n_head = config[varables.NUM_HEADS] + self.single_head_dim = config[varables.DIM_ATTENTION] // self.n_head + self.attention_features = config[varables.DIM_ATTENTION] + self.register_buffer("mask", torch.tril(torch.ones(config[varables.SIZE_BLOCK], config[varables.SIZE_BLOCK])) + .view(1, 1, config[varables.SIZE_BLOCK], config[varables.SIZE_BLOCK])) + + def forward(self, x_encoder,x_decoder, layer_past=None): + B_encoder, T_encoder, C_encoder = x_encoder.size() + B_decoder, T_decoder, C_decoder = x_decoder.size() + k = self.key( x_encoder).view(B_encoder, T_encoder, self.n_head,self.single_head_dim).transpose(1, 2) + q = self.query(x_decoder).view(B_encoder, T_decoder, self.n_head,self.single_head_dim).transpose(1, 2) + v = self.value(x_encoder).view(B_encoder, T_encoder, self.n_head,self.single_head_dim).transpose(1, 2) + att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1))) + att = att.masked_fill(self.mask[:,:,:T_decoder,:T_encoder] == 0, float('-inf')) + att = F.softmax(att, dim=-1) + att = self.dropout_attention(att) + y = att @ v + y = y.transpose(1, 2).contiguous().view(B_encoder, T_decoder, 
self.attention_features) + y = self.dropout_residue(self.projection(y)) + return y + + + + +class EncoderBlock(nn.Module): + def __init__(self, config): + super().__init__() + self.ln1 = nn.LayerNorm(config[varables.DIM_EMBEDDING]) + self.ln2 = nn.LayerNorm(config[varables.DIM_EMBEDDING]) + self.attn = CausalSelfAttention(config) + self.mlp = nn.Sequential( + nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_FEEDFORWARD]), + nn.GELU(), + nn.Linear(config[varables.DIM_FEEDFORWARD], config[varables.DIM_EMBEDDING]), + nn.Dropout(config[varables.RATE_DROPOUT]), + ) + + def forward(self, x): + # = y_input + x = x + self.attn(self.ln1(x)) + x = x + self.mlp(self.ln2(x)) + return x + +class DecoderBlock(nn.Module): + def __init__(self, config): + super().__init__() + self.ln1 = nn.LayerNorm(config[varables.DIM_EMBEDDING]) + self.ln2 = nn.LayerNorm(config[varables.DIM_EMBEDDING]) + self.masked_attn = CausalSelfAttention(config) + self.cross_attn = CrossAttention(config) + self.mlp = nn.Sequential( + nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_FEEDFORWARD]), + nn.GELU(), + nn.Linear(config[varables.DIM_FEEDFORWARD], config[varables.DIM_EMBEDDING]), + nn.Dropout(config[varables.RATE_DROPOUT]), + ) + + def forward(self, x_encoder,x): + # = y_input + x = x + self.masked_attn(self.ln1(x)) + x = x + self.cross_attn(x_encoder,self.ln1(x)) + x = x + self.mlp(self.ln2(x)) + return x + +class Model(nn.Module): + def __init__(self, config): + super().__init__() + self.tok_emb = nn.Embedding(config[varables.SIZE_VOCAB], config[varables.DIM_EMBEDDING]) + self.pos_emb = nn.Parameter(torch.zeros(1, config[varables.SIZE_BLOCK], config[varables.DIM_EMBEDDING])) + self.drop = nn.Dropout(config[varables.RATE_DROPOUT]) + self.encoder_blocks = nn.ModuleList([EncoderBlock(config) for _ in range(config[varables.NUM_LAYERS])]) + self.decoder_blocks = nn.ModuleList([DecoderBlock(config) for _ in range(config[varables.NUM_LAYERS])]) + # self.blocks = 
nn.Sequential(*[DecoderBlock(config) for _ in range(config[varables.NUM_LAYERS])]) + self.ln_f = nn.LayerNorm(config[varables.DIM_EMBEDDING]) + self.head = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.SIZE_VOCAB], bias=False) + self.block_size = config[varables.SIZE_BLOCK] + self.apply(self._init_weights) + logger.info("number of parameters: %e", sum(p.numel() for p in self.parameters())) + self.optimizer = None + + def get_block_size(self): + return self.block_size + + def _init_weights(self, module): + if isinstance(module, (nn.Linear, nn.Embedding)): + module.weight.data.normal_(mean=0.0, std=0.02) + if isinstance(module, nn.Linear) and module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.LayerNorm): + module.bias.data.zero_() + module.weight.data.fill_(1.0) + def init_optimizers(self,train_config): + optimizer = torch.optim.Adam(self.parameters(), lr=train_config[varables.RATE_LEARNING]) + return optimizer + def init_scheduler(self,train_config): + scheduler = torch.optim.lr_scheduler.StepLR(self.optimizer, step_size=train_config[varables.SIZE_STEP], gamma=train_config[varables.GAMMA]) + return scheduler + def get_collate_fn(self, vocab): + def collate(results): + x_in = [a[0] for a in results] + y_in = [a[1] for a in results] + boundary = -1 + max_len_x = max([len(a) for a in x_in]) + max_len_y = max([len(a) for a in y_in]) + x = torch.tensor([(a+[vocab[varables.TOKEN_PAD]]*(max_len_x-len(a))) for a in x_in],dtype=torch.long) + y = torch.tensor([(a+[vocab[varables.TOKEN_PAD]]*(max_len_y-len(a))) for a in y_in],dtype=torch.long) + return x,y,boundary + return collate + + def forward(self, x_in, y_in, y_out=None,boundary=None): + x_in = self.drop(self.tok_emb(x_in) + self.pos_emb[:, :x_in.size()[1], :]) + y_in = self.drop(self.tok_emb(y_in) + self.pos_emb[:, :y_in.size()[1], :]) + # + for encoder_block in self.encoder_blocks: + x_in = encoder_block(x_in) + x_in = self.ln_f(x_in) + for decoder_block in self.decoder_blocks: 
+ y_in = decoder_block(x_in,y_in) + y_in = self.ln_f(y_in) + logits = self.head(y_in) + loss = None + if y_out is not None: + loss = F.cross_entropy(logits.view(-1, logits.size(-1)), y_out.view(-1)) + return logits, loss + +# mark test \ No newline at end of file diff --git a/SCMG/models/Transformer_debug2 copy/model.py b/SCMG/models/Transformer_debug2 copy/model.py new file mode 100644 index 0000000000000000000000000000000000000000..d93c209eece94f1cbce12f9907ce3edf18017f6f --- /dev/null +++ b/SCMG/models/Transformer_debug2 copy/model.py @@ -0,0 +1,278 @@ +import math +import logging + +import torch +import torch.nn as nn +from torch.nn import functional as F + +# logger = logging.getLogger(__name__) +from SCMG.config import varables +from torch.autograd import Variable + +# class PositionalEncoder(nn.Module): +# def __init__(self, config): +# super().__init__() +# pe = torch.zeros(config[varables.SIZE_BLOCK], config[varables.DIM_ATTENTION]) +# for pos in range(config[varables.SIZE_BLOCK]): +# for i in range(0, config[varables.DIM_ATTENTION], 2): +# pe[pos, i] = \ +# math.sin(pos / (10000 ** ((2 * i)/config[varables.DIM_ATTENTION]))) +# pe[pos, i + 1] = \ +# math.cos(pos / (10000 ** ((2 * (i + 1))/config[varables.DIM_ATTENTION]))) +# pe = pe.unsqueeze(0) +# self.register_buffer('pe', pe) +# def forward(self, T): +# #add constant to embedding +# x = Variable(self.pe[:,:T], requires_grad=False) +# return x + + + +class PositionalEncoder(nn.Module): + def __init__(self, config): + super(PositionalEncoder, self).__init__() + self.Dropout = nn.Dropout(p=config[varables.RATE_DROPOUT]) + max_len = config[varables.SIZE_BLOCK] + pe = torch.zeros(max_len, config[varables.DIM_ATTENTION]) + position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1) + div_term = torch.exp(torch.arange(0, config[varables.DIM_ATTENTION], 2).float() * (-math.log(10000.0) / config[varables.DIM_ATTENTION])) + pe[:, 0::2] = torch.sin(position * div_term) + pe[:, 1::2] = torch.cos(position * 
div_term) + pe = pe.unsqueeze(0) + self.register_buffer('pe', pe) + def forward(self, T): + x = self.Dropout(self.pe[:,:T, :]) + return x + + + +class Attention(nn.Module): + def __init__(self, config): + super().__init__() + assert config[varables.DIM_ATTENTION] % config[varables.NUM_HEADS] == 0 + self.Key = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION]) + self.Query = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION]) + self.Value = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION]) + self.Dropout_Attention = nn.Dropout(config[varables.RATE_DROPOUT]) + self.Dropout_Residue = nn.Dropout(config[varables.RATE_DROPOUT]) + self.Projection = nn.Linear(config[varables.DIM_ATTENTION], config[varables.DIM_EMBEDDING]) + self.NumberOfHeads = config[varables.NUM_HEADS] + self.DimHead = config[varables.DIM_ATTENTION] // self.NumberOfHeads + self.DimAttention = config[varables.DIM_ATTENTION] + + def forward(self, X_1,X_2, mask=None): + if X_2 is None: + X_2 = X_1 + BatchSize, T_Encoder, _ = X_1.size() + BatchSize, T_Decoder, _ = X_2.size() + K = self.Key( X_1).view(BatchSize, T_Encoder, self.NumberOfHeads,self.DimHead).transpose(1, 2) + Q = self.Query(X_2).view(BatchSize, T_Decoder, self.NumberOfHeads,self.DimHead).transpose(1, 2) + V = self.Value(X_1).view(BatchSize, T_Encoder, self.NumberOfHeads,self.DimHead).transpose(1, 2) + # k,q,v dimension: (BatchSize, SequenceSize, NumberOfHeads, HeadDimension) 3,4,5,16 + ScoreAttention = (Q @ K.transpose(-2, -1)) / math.sqrt(self.DimHead) + ScoreAttention = ScoreAttention.masked_fill(mask==0, -1e9) + ScoreAttention = F.softmax(ScoreAttention, dim=-1) + ScoreAttention = self.Dropout_Attention(ScoreAttention) + # k.transpose(-2,-1): 3,4,16,5 + # (q@(k.transpose(-2,-1))): 3,4,5,5 + Z = ScoreAttention @ V + # y dimension: 3,4,5,16 + Z = Z.transpose(1, 2).contiguous().view(BatchSize, T_Decoder, self.DimAttention) + # y dimension: 3,5,64 + Z = 
self.Dropout_Residue(self.Projection(Z)) + return Z + + + + + + + + + + +class FeedForward(nn.Module): + def __init__(self, config): + super().__init__() + if config[varables.DIM_FEEDFORWARD] == 0: + Dim_FeedForward = config[varables.DIM_ATTENTION] *4 + else: + Dim_FeedForward = config[varables.DIM_FEEDFORWARD] + self.Linear1 = nn.Linear(config[varables.DIM_EMBEDDING], Dim_FeedForward) + self.GELU = nn.GELU() + self.Linear2 = nn.Linear(Dim_FeedForward, config[varables.DIM_EMBEDDING]) + self.Dropout = nn.Dropout(config[varables.RATE_DROPOUT]) + + def forward(self,x): + x = self.Linear1(x) + x = self.GELU (x) + x = self.Dropout(x) + x = self.Linear2(x) + return x + + + + +class EncoderBlock(nn.Module): + def __init__(self, config): + super().__init__() + self.LayerNorm1 = nn.LayerNorm(config[varables.DIM_EMBEDDING]) + self.LayerNorm2 = nn.LayerNorm(config[varables.DIM_EMBEDDING]) + self.Dropout1 = nn.Dropout(config[varables.RATE_DROPOUT]) + self.Dropout2 = nn.Dropout(config[varables.RATE_DROPOUT]) + self.Attention = Attention( config) + self.FeedForward = FeedForward(config) + + def forward(self, X_Encoder,Mask_Encoder): + X_Encoder = self.Dropout1(X_Encoder + self.Attention (self.LayerNorm1(X_Encoder), None, Mask_Encoder)) + X_Encoder = self.Dropout2(X_Encoder + self.FeedForward(self.LayerNorm2(X_Encoder))) + return X_Encoder + +class DecoderBlock(nn.Module): + def __init__(self, config): + super().__init__() + self.LayerNorm1 = nn.LayerNorm(config[varables.DIM_EMBEDDING]) + self.LayerNorm2 = nn.LayerNorm(config[varables.DIM_EMBEDDING]) + self.LayerNorm3 = nn.LayerNorm(config[varables.DIM_EMBEDDING]) + self.Dropout1 = nn.Dropout(config[varables.RATE_DROPOUT]) + self.Dropout2 = nn.Dropout(config[varables.RATE_DROPOUT]) + self.Dropout3 = nn.Dropout(config[varables.RATE_DROPOUT]) + self.AttentionMasked = Attention( config) + self.AttentionCross = Attention( config) + self.FeedForward = FeedForward(config) + + def forward(self, 
X_Encoder,X_Decoder,Mask_Cross,Mask_Decoder): + X_Decoder = self.Dropout1(X_Decoder + self.AttentionMasked(self.LayerNorm1(X_Decoder), None, Mask_Decoder)) + X_Decoder = self.Dropout2(X_Decoder + self.AttentionCross ( X_Encoder, self.LayerNorm2(X_Decoder), Mask_Cross )) + X_Decoder = self.Dropout3(X_Decoder + self.FeedForward (self.LayerNorm3(X_Decoder) )) + return X_Decoder + + + + + + + + + + + + + + + + + + + +class Model(nn.Module): + def __init__(self, config): + super().__init__() + # Varables + self.Dim_Attention = config[varables.DIM_ATTENTION] + self.Token_Padding_Encoder = config["Token_Padding_Encoder"] + self.Token_Padding_Decoder = config["Token_Padding_Decoder"] + # Embedding and positional encoding layers + self.Embedding_Encoder = nn.Embedding(len(config["vocab_encoder"]), config[varables.DIM_ATTENTION]) + self.Embedding_Decoder = nn.Embedding(len(config["vocab_decoder"]), config[varables.DIM_ATTENTION]) + self.pos_emb = PositionalEncoder(config) + # Dropout and normalization layers + self.Dropout1 = nn.Dropout(config[varables.RATE_DROPOUT]) + self.Dropout2 = nn.Dropout(config[varables.RATE_DROPOUT]) + self.LayerNorm1 = nn.LayerNorm(config[varables.DIM_EMBEDDING]) + self.LayerNorm2 = nn.LayerNorm(config[varables.DIM_EMBEDDING]) + # Transformer layers + self.encoder_blocks = nn.ModuleList([EncoderBlock(config) for _ in range(config[varables.NUM_LAYERS])]) + self.decoder_blocks = nn.ModuleList([DecoderBlock(config) for _ in range(config[varables.NUM_LAYERS])]) + # Output layer + self.head = nn.Linear(config[varables.DIM_ATTENTION], len(config["vocab_decoder"]), bias=False) + # Init + self.apply(self._init_weights) + self.optimizer = None + # logger.info("number of parameters: %e", sum(p.numel() for p in self.parameters())) + + def _init_weights(self, module): + for p in module.parameters(): + if p.dim() > 1: + nn.init.xavier_uniform_(p) + # if isinstance(module, (nn.Linear, nn.Embedding)): + # module.weight.data.normal_(mean=0.0, std=0.02) + # if 
isinstance(module, nn.Linear) and module.bias is not None: + # module.bias.data.zero_() + # elif isinstance(module, nn.LayerNorm): + # module.bias.data.zero_() + # module.weight.data.fill_(1.0) + def init_optimizers(self,train_config): + optimizer = torch.optim.Adam(self.parameters(), lr=train_config[varables.RATE_LEARNING]) + return optimizer + def init_scheduler(self,train_config): + scheduler = torch.optim.lr_scheduler.StepLR(self.optimizer, step_size=train_config[varables.SIZE_STEP], gamma=train_config[varables.GAMMA]) + return scheduler + def get_collate_fn(self, vocab_encoder,vocab_decoder): + def collate(results): + X_Encoder = [a[0] for a in results] + X_Decoder = [a[1] for a in results] + boundary = -1 + max_len_x = max([len(a) for a in X_Encoder]) + max_len_y = max([len(a) for a in X_Decoder]) + x = torch.tensor([(a+[vocab_encoder[varables.TOKEN_PAD]]*(max_len_x-len(a))) for a in X_Encoder],dtype=torch.long) + y = torch.tensor([(a+[vocab_decoder[varables.TOKEN_PAD]]*(max_len_y-len(a))) for a in X_Decoder],dtype=torch.long) + return x,y,boundary + return collate + + + def generate_masks(self,X_Encoder, X_Decoder): + # Generate encoder, decoder, cross masks + T = X_Decoder.shape[1] + Mask_Encoder = (X_Encoder != self.Token_Padding_Encoder).unsqueeze(-2).unsqueeze(-2) + Mask_Decoder = (X_Decoder != self.Token_Padding_Decoder).unsqueeze(-2).unsqueeze(-2).repeat(1,1,T,1) + Mask_Cross = (X_Encoder != self.Token_Padding_Encoder).unsqueeze(-2).unsqueeze(-2) + mask_tril = torch.tril(torch.ones(T, T)).view(1, 1, T, T).to(Mask_Decoder.device) + Mask_Decoder = Mask_Decoder.masked_fill(mask_tril==0,0) + return Mask_Encoder,Mask_Decoder,Mask_Cross + + def forward(self, X_Encoder, X_Decoder, Y_Decoder_Ref=None,boundary=None): + Mask_Encoder, Mask_Decoder,Mask_Cross = self.generate_masks(X_Encoder, X_Decoder) + # preprocess + X_Encoder = self.Dropout1(self.Embedding_Encoder(X_Encoder) * math.sqrt(self.Dim_Attention) + self.pos_emb(X_Encoder.size(1))) + X_Decoder = 
self.Dropout2(self.Embedding_Decoder(X_Decoder) * math.sqrt(self.Dim_Attention) + self.pos_emb(X_Decoder.size(1))) + #### Now X_Encoder: BatchSize, SequenceLength, DimAttention + # Encoder blocks + for encoder_block in self.encoder_blocks: + X_Encoder = encoder_block(X_Encoder,Mask_Encoder) + X_Encoder = self.LayerNorm1(X_Encoder) + # Decoder blocks + for decoder_block in self.decoder_blocks: + X_Decoder = decoder_block(X_Encoder,X_Decoder,Mask_Cross,Mask_Decoder) + X_Decoder = self.LayerNorm2(X_Decoder) + Y_Decoder_Logits = self.head(X_Decoder) + loss = None + if Y_Decoder_Ref is not None: + loss = F.cross_entropy(Y_Decoder_Logits.view(-1, Y_Decoder_Logits.size(-1)), Y_Decoder_Ref.view(-1),ignore_index=self.Token_Padding_Decoder) + return Y_Decoder_Logits, loss + + + + + + + + + + + + + # def generate_masks(self,X_Encoder, X_Decoder): + # # Generate encoder, decoder, cross masks + # Mask_Encoder = (X_Encoder != self.Token_Padding_Encoder).unsqueeze(-2).int().cpu() + # Mask_Decoder = (X_Decoder != self.Token_Padding_Decoder).unsqueeze(-2).int().cpu() + # Mask_Cross = Mask_Decoder.unsqueeze(-1) @ Mask_Encoder.unsqueeze(-2) + # Mask_Encoder = Mask_Encoder.unsqueeze(-1) @ Mask_Encoder.unsqueeze(-2) + # Mask_Decoder = Mask_Decoder.unsqueeze(-1) @ Mask_Decoder.unsqueeze(-2) + # T = X_Decoder.shape[1] + # mask_tril = torch.tril(torch.ones(T, T)).view(1, 1, T, T) + # Mask_Decoder = Mask_Decoder.masked_fill(mask_tril==0,0) + # Mask_Encoder = Mask_Encoder.to(X_Encoder.device) + # Mask_Decoder = Mask_Decoder.to(X_Decoder.device) + # Mask_Cross = Mask_Cross.to(X_Encoder.device) + # return Mask_Encoder,Mask_Decoder,Mask_Cross diff --git a/SCMG/models/Transformer_debug2 copy/sampler.py b/SCMG/models/Transformer_debug2 copy/sampler.py new file mode 100644 index 0000000000000000000000000000000000000000..1c606302447534d425b50bbd15c153ea79895b65 --- /dev/null +++ b/SCMG/models/Transformer_debug2 copy/sampler.py @@ -0,0 +1,85 @@ +import random +import numpy as np +import torch 
+import torch.nn as nn +from torch.nn import functional as F + +def set_seed(seed): + random.seed(seed) + np.random.seed(seed) + torch.manual_seed(seed) + torch.cuda.manual_seed_all(seed) + +def top_k_logits(logits, k): + v, ix = torch.topk(logits, k) + out = logits.clone() + out[out < v[:, [-1]]] = -float('Inf') + return out + +@torch.no_grad() +def sample(model, x, steps, temperature=1.0, sample=False, top_k=None): + block_size = model.get_block_size() + model.eval() + for k in range(steps): + x_cond = x if x.size(1) <= block_size else x[:, -block_size:] + logits, _ = model(x_cond) + logits = logits[:, -1, :] / temperature + if top_k is not None: + logits = top_k_logits(logits, top_k) + probs = F.softmax(logits, dim=-1) + if sample: + ix = torch.multinomial(probs, num_samples=1) + else: + _, ix = torch.topk(probs, k=1, dim=-1) + x = torch.cat((x, ix), dim=1) + + return x + + + + +@torch.no_grad() +def sample(model, x, steps, temperature=1.0,boundary=None): + block_size = model.get_block_size() + model.eval() + for k in range(steps): + x_cond = x if x.size(1) <= block_size else x[:, -block_size:] + logits, _ = model(x_cond,boundary=boundary) + logits = logits[:, -1, :] / temperature + probs = F.softmax(logits, dim=-1) + ix = torch.multinomial(probs, num_samples=1) + x = torch.cat((x, ix), dim=1) + return x + +'L_5*C(=O)NCc1cccc(OC)c1.*c1nsc2ccccc12COc1cccc(CNC(=O)c2cccc(NC(=O)c3nsc4ccccc34)c2)c1' + +# for i in range(1,21): +def sample_L(i,option='string'): + # i=2 + prefix = 'L_'+str(i) + string_input = prefix + '*O=C1NN=Cc2c1cccc2.*O=C(C1CC1)N1CCNCC1' + array_input = [vocab[a] for a in [''] + list(string_input)] + boundary = [len(array_input)] + tensor_input = torch.tensor(array_input,device='cuda').unsqueeze(0).repeat(32,1) + boundary = boundary*32 + tensor_output = sample(model,tensor_input,250,boundary=boundary) + strings_output = [] + for j in range(tensor_output.shape[0]): + list_string_output = [inv[a] for a in tensor_output[j,boundary[j]:].cpu().numpy() if 
a != vocab['']] + # if list_string_output[0] == '': + # list_string_output = list_string_output[1:] + if list_string_output[-1] == '': + list_string_output = list_string_output[:-1] + string_output = ''.join(list_string_output) + strings_output.append(string_output) + print(string_output) + for j in range(tensor_output.shape[0]): + if test_valid(strings_output[j]): + print(1) + else: + print(0) + + # logits,_ = model(tensor_input,boundary=boundary) + + +['', 'L', '_', '5', '*', 'C', '(', '=', 'O', ')', 'N', 'C', 'c', '1', 'c', 'c', 'c', 'c', '(', 'O', 'C', ')', 'c', '1', '.', '*', 'c', '1', 'n', 's', 'c', '2', 'c', 'c', 'c', 'c', 'c', '1', '2', 'C', 'O', 'c', '1', 'c', 'c', 'c', 'c', '(', 'C', 'N', 'C', '(', '=', 'O', ')', 'c', '2', 'c', 'c', 'c', 'c', '(', 'N', 'C', '(', '=', 'O', ')', 'c', '3', 'n', 's', 'c', '4', 'c', 'c', 'c', 'c', 'c', '3', '4', ')', 'c', '2', ')', 'c', '1', ''] diff --git a/SCMG/models/Transformer_debug2/__init__.py b/SCMG/models/Transformer_debug2/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/SCMG/models/Transformer_debug2/__pycache__/__init__.cpython-310.pyc b/SCMG/models/Transformer_debug2/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..3f1f423a143412b6be2ebd2b886b4a42ee570950 Binary files /dev/null and b/SCMG/models/Transformer_debug2/__pycache__/__init__.cpython-310.pyc differ diff --git a/SCMG/models/Transformer_debug2/__pycache__/model copy 2.cpython-310.pyc b/SCMG/models/Transformer_debug2/__pycache__/model copy 2.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..193c50af6a84dfa33e102f8745bfe92e0d2307fb Binary files /dev/null and b/SCMG/models/Transformer_debug2/__pycache__/model copy 2.cpython-310.pyc differ diff --git a/SCMG/models/Transformer_debug2/__pycache__/model copy.cpython-310.pyc b/SCMG/models/Transformer_debug2/__pycache__/model 
copy.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..72f2ea02a6efe9f2dbb7ff61c903adcfbc00bb2a Binary files /dev/null and b/SCMG/models/Transformer_debug2/__pycache__/model copy.cpython-310.pyc differ diff --git a/SCMG/models/Transformer_debug2/__pycache__/model.cpython-310.pyc b/SCMG/models/Transformer_debug2/__pycache__/model.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c104e3963e9e4b5c6f346c3c913a91b31996503a Binary files /dev/null and b/SCMG/models/Transformer_debug2/__pycache__/model.cpython-310.pyc differ diff --git a/SCMG/models/Transformer_debug2/__pycache__/sampler.cpython-310.pyc b/SCMG/models/Transformer_debug2/__pycache__/sampler.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..121749b9d57b595bd84555e9d479c65a4e787a61 Binary files /dev/null and b/SCMG/models/Transformer_debug2/__pycache__/sampler.cpython-310.pyc differ diff --git a/SCMG/models/Transformer_debug2/model copy 2.py b/SCMG/models/Transformer_debug2/model copy 2.py new file mode 100644 index 0000000000000000000000000000000000000000..8b254fbe02eafd3f7370cfae17eb6729563aa260 --- /dev/null +++ b/SCMG/models/Transformer_debug2/model copy 2.py @@ -0,0 +1,420 @@ +import math +import logging + +import torch +import torch.nn as nn +from torch.nn import functional as F + +logger = logging.getLogger(__name__) +from SCMG.config import varables + +# class ModelConfig(): +# rate_dropout_embedding = 0.1 +# rate_dropout_residue = 0.1 +# rate_dropout_attention = 0.1 +# block_size=125 +# def __init__(self, size_vocab, **kwargs): +# self.size_vocab = size_vocab +# for k,v in kwargs.items(): +# setattr(self, k, v) + +class CausalSelfAttention(nn.Module): + def __init__(self, config): + super().__init__() + assert config[varables.DIM_ATTENTION] % config[varables.NUM_HEADS] == 0 + self.key = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION]) + self.query = 
nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION]) + self.value = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION]) + self.dropout_attention = nn.Dropout(config[varables.RATE_DROPOUT]) + self.dropout_residue = nn.Dropout(config[varables.RATE_DROPOUT]) + self.projection = nn.Linear(config[varables.DIM_ATTENTION], config[varables.DIM_EMBEDDING]) + self.register_buffer("mask", torch.tril(torch.ones(config[varables.SIZE_BLOCK], config[varables.SIZE_BLOCK])) + .view(1, 1, config[varables.SIZE_BLOCK], config[varables.SIZE_BLOCK])) + self.n_head = config[varables.NUM_HEADS] + self.single_head_dim = config[varables.DIM_ATTENTION] // self.n_head + self.attention_features = config[varables.DIM_ATTENTION] + + def forward(self, x, layer_past=None): + B, T, C = x.size() + k = self.key(x).view(B, T, self.n_head,self.single_head_dim).transpose(1, 2) + q = self.query(x).view(B, T, self.n_head,self.single_head_dim).transpose(1, 2) + v = self.value(x).view(B, T, self.n_head,self.single_head_dim).transpose(1, 2) + att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1))) + att = att.masked_fill(self.mask[:,:,:T,:T] == 0, float('-inf')) + att = F.softmax(att, dim=-1) + att = self.dropout_attention(att) + y = att @ v + y = y.transpose(1, 2).contiguous().view(B, T, self.attention_features) + y = self.dropout_residue(self.projection(y)) + return y + + +class CrossAttention(nn.Module): + def __init__(self, config): + super().__init__() + assert config[varables.DIM_ATTENTION] % config[varables.NUM_HEADS] == 0 + self.key = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION]) + self.query = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION]) + self.value = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION]) + self.dropout_attention = nn.Dropout(config[varables.RATE_DROPOUT]) + self.dropout_residue = nn.Dropout(config[varables.RATE_DROPOUT]) + self.projection = 
nn.Linear(config[varables.DIM_ATTENTION], config[varables.DIM_EMBEDDING]) + self.n_head = config[varables.NUM_HEADS] + self.single_head_dim = config[varables.DIM_ATTENTION] // self.n_head + self.attention_features = config[varables.DIM_ATTENTION] + self.register_buffer("mask", torch.tril(torch.ones(config[varables.SIZE_BLOCK], config[varables.SIZE_BLOCK])) + .view(1, 1, config[varables.SIZE_BLOCK], config[varables.SIZE_BLOCK])) + + def forward(self, x_encoder,x_decoder, layer_past=None): + B_encoder, T_encoder, C_encoder = x_encoder.size() + B_decoder, T_decoder, C_decoder = x_decoder.size() + k = self.key( x_encoder).view(B_encoder, T_encoder, self.n_head,self.single_head_dim).transpose(1, 2) + q = self.query(x_decoder).view(B_encoder, T_decoder, self.n_head,self.single_head_dim).transpose(1, 2) + v = self.value(x_encoder).view(B_encoder, T_encoder, self.n_head,self.single_head_dim).transpose(1, 2) + att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1))) + att = att.masked_fill(self.mask[:,:,:T_decoder,:T_encoder] == 0, float('-inf')) + att = F.softmax(att, dim=-1) + att = self.dropout_attention(att) + y = att @ v + y = y.transpose(1, 2).contiguous().view(B_encoder, T_decoder, self.attention_features) + y = self.dropout_residue(self.projection(y)) + return y + + + + +class EncoderBlock(nn.Module): + def __init__(self, config): + super().__init__() + self.ln1 = nn.LayerNorm(config[varables.DIM_EMBEDDING]) + self.ln2 = nn.LayerNorm(config[varables.DIM_EMBEDDING]) + self.attn = CausalSelfAttention(config) + self.mlp = nn.Sequential( + nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_FEEDFORWARD]), + nn.GELU(), + nn.Linear(config[varables.DIM_FEEDFORWARD], config[varables.DIM_EMBEDDING]), + nn.Dropout(config[varables.RATE_DROPOUT]), + ) + + def forward(self, x): + # = y_input + x = x + self.attn(self.ln1(x)) + x = x + self.mlp(self.ln2(x)) + return x + +class DecoderBlock(nn.Module): + def __init__(self, config): + super().__init__() + self.ln1 = 
nn.LayerNorm(config[varables.DIM_EMBEDDING]) + self.ln2 = nn.LayerNorm(config[varables.DIM_EMBEDDING]) + self.masked_attn = CausalSelfAttention(config) + self.cross_attn = CrossAttention(config) + self.mlp = nn.Sequential( + nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_FEEDFORWARD]), + nn.GELU(), + nn.Linear(config[varables.DIM_FEEDFORWARD], config[varables.DIM_EMBEDDING]), + nn.Dropout(config[varables.RATE_DROPOUT]), + ) + + def forward(self, x_encoder,x): + # = y_input + x = x + self.masked_attn(self.ln1(x)) + x = x + self.cross_attn(x_encoder,self.ln1(x)) + x = x + self.mlp(self.ln2(x)) + return x + + + + + + + + + + + + + + + + + +import torch +import torch.nn as nn +import torch.nn.functional as F +import math + + +class Norm(nn.Module): + def __init__(self, d_model, eps = 1e-6): + super().__init__() + + self.size = d_model + + # create two learnable parameters to calibrate normalisation + self.alpha = nn.Parameter(torch.ones(self.size)) + self.bias = nn.Parameter(torch.zeros(self.size)) + + self.eps = eps + + def forward(self, x): + norm = self.alpha * (x - x.mean(dim=-1, keepdim=True)) \ + / (x.std(dim=-1, keepdim=True) + self.eps) + self.bias + return norm + +def attention(q, k, v, d_k, mask=None, dropout=None): + + scores = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(d_k) + + if mask is not None: + mask = mask.unsqueeze(1) + scores = scores.masked_fill(mask == 0, -1e9) + + scores = F.softmax(scores, dim=-1) + + if dropout is not None: + scores = dropout(scores) + + output = torch.matmul(scores, v) + return output + +class MultiHeadAttention(nn.Module): + def __init__(self, heads, d_model, dropout = 0.1): + super().__init__() + + self.d_model = d_model + self.d_k = d_model // heads + self.h = heads + + self.q_linear = nn.Linear(d_model, d_model) + self.v_linear = nn.Linear(d_model, d_model) + self.k_linear = nn.Linear(d_model, d_model) + + self.dropout = nn.Dropout(dropout) + self.out = nn.Linear(d_model, d_model) + + def forward(self, 
q, k, v, mask=None): + + bs = q.size(0) + + # perform linear operation and split into N heads + k = self.k_linear(k).view(bs, -1, self.h, self.d_k) + q = self.q_linear(q).view(bs, -1, self.h, self.d_k) + v = self.v_linear(v).view(bs, -1, self.h, self.d_k) + + # transpose to get dimensions bs * N * sl * d_model + k = k.transpose(1,2) + q = q.transpose(1,2) + v = v.transpose(1,2) + + + # calculate attention using function we will define next + scores = attention(q, k, v, self.d_k, mask, self.dropout) + # concatenate heads and put through final linear layer + concat = scores.transpose(1,2).contiguous()\ + .view(bs, -1, self.d_model) + output = self.out(concat) + + return output + +class FeedForward(nn.Module): + def __init__(self, d_model, d_ff=2048, dropout = 0.1): + super().__init__() + + # We set d_ff as a default to 2048 + self.linear_1 = nn.Linear(d_model, d_ff) + self.dropout = nn.Dropout(dropout) + self.linear_2 = nn.Linear(d_ff, d_model) + + def forward(self, x): + x = self.dropout(F.relu(self.linear_1(x))) + x = self.linear_2(x) + return x + + + + +import torch +import torch.nn as nn +import copy + + +class EncoderLayer(nn.Module): + def __init__(self, d_model, heads, dropout=0.1): + super().__init__() + self.norm_1 = Norm(d_model) + self.norm_2 = Norm(d_model) + self.attn = MultiHeadAttention(heads, d_model, dropout=dropout) + self.ff = FeedForward(d_model, dropout=dropout) + self.dropout_1 = nn.Dropout(dropout) + self.dropout_2 = nn.Dropout(dropout) + + def forward(self, x, mask): + x2 = self.norm_1(x) + x = x + self.dropout_1(self.attn(x2,x2,x2,mask)) + x2 = self.norm_2(x) + x = x + self.dropout_2(self.ff(x2)) + return x + +# build a decoder layer with two multi-head attention layers and +# one feed-forward layer +class DecoderLayer(nn.Module): + def __init__(self, d_model, heads, dropout=0.1): + super().__init__() + self.norm_1 = Norm(d_model) + self.norm_2 = Norm(d_model) + self.norm_3 = Norm(d_model) + + self.dropout_1 = nn.Dropout(dropout) + 
self.dropout_2 = nn.Dropout(dropout) + self.dropout_3 = nn.Dropout(dropout) + + self.attn_1 = MultiHeadAttention(heads, d_model, dropout=dropout) + self.attn_2 = MultiHeadAttention(heads, d_model, dropout=dropout) + self.ff = FeedForward(d_model, dropout=dropout) + + def forward(self, x, e_outputs, src_mask, trg_mask): + x2 = self.norm_1(x) + x = x + self.dropout_1(self.attn_1(x2, x2, x2, trg_mask)) + x2 = self.norm_2(x) + x = x + self.dropout_2(self.attn_2(x2, e_outputs, e_outputs, \ + src_mask)) + x2 = self.norm_3(x) + x = x + self.dropout_3(self.ff(x2)) + return x + + +import torch +import torch.nn as nn +import math +from torch.autograd import Variable + +class Embedder(nn.Module): + def __init__(self, vocab_size, d_model): + super().__init__() + self.d_model = d_model + self.embed = nn.Embedding(vocab_size, d_model) + def forward(self, x): + return self.embed(x) + +class PositionalEncoder(nn.Module): + def __init__(self, d_model, max_seq_len = 200, dropout = 0.1): + super().__init__() + self.d_model = d_model + self.dropout = nn.Dropout(dropout) + # create constant 'pe' matrix with values dependant on + # pos and i + pe = torch.zeros(max_seq_len, d_model) + for pos in range(max_seq_len): + for i in range(0, d_model, 2): + pe[pos, i] = \ + math.sin(pos / (10000 ** ((2 * i)/d_model))) + pe[pos, i + 1] = \ + math.cos(pos / (10000 ** ((2 * (i + 1))/d_model))) + pe = pe.unsqueeze(0) + self.register_buffer('pe', pe) + + + def forward(self, x): + # make embeddings relatively larger + x = x * math.sqrt(self.d_model) + #add constant to embedding + seq_len = x.size(1) + pe = Variable(self.pe[:,:seq_len], requires_grad=False) + if x.is_cuda: + pe.cuda() + x = x + pe + return self.dropout(x) + +def get_clones(module, N): + return nn.ModuleList([copy.deepcopy(module) for i in range(N)]) + +class Encoder(nn.Module): + def __init__(self, vocab_size, d_model, N, heads, dropout): + super().__init__() + self.N = N + self.embed = Embedder(vocab_size, d_model) + self.pe = 
PositionalEncoder(d_model, dropout=dropout) + self.layers = get_clones(EncoderLayer(d_model, heads, dropout), N) + self.norm = Norm(d_model) + def forward(self, src, mask): + x = self.embed(src) + x = self.pe(x) + for i in range(self.N): + x = self.layers[i](x, mask) + return self.norm(x) + +class Decoder(nn.Module): + def __init__(self, vocab_size, d_model, N, heads, dropout): + super().__init__() + self.N = N + self.embed = Embedder(vocab_size, d_model) + self.pe = PositionalEncoder(d_model, dropout=dropout) + self.layers = get_clones(DecoderLayer(d_model, heads, dropout), N) + self.norm = Norm(d_model) + def forward(self, trg, e_outputs, src_mask, trg_mask): + x = self.embed(trg) + x = self.pe(x) + for i in range(self.N): + x = self.layers[i](x, e_outputs, src_mask, trg_mask) + return self.norm(x) + +class Model(nn.Module): + def __init__(self, config): + super().__init__() + self.encoder = Encoder(len(config["vocab_encoder"]), config[varables.DIM_ATTENTION], config[varables.NUM_LAYERS], config[varables.NUM_HEADS], config[varables.RATE_DROPOUT]) + self.decoder = Decoder(len(config["vocab_decoder"]), config[varables.DIM_ATTENTION], config[varables.NUM_LAYERS], config[varables.NUM_HEADS], config[varables.RATE_DROPOUT]) + self.out = nn.Linear(config[varables.DIM_ATTENTION], len(config["vocab_decoder"])) + # self.tok_emb = nn.Embedding(config[varables.SIZE_VOCAB], config[varables.DIM_EMBEDDING]) + # self.pos_emb = nn.Parameter(torch.zeros(1, config[varables.SIZE_BLOCK], config[varables.DIM_EMBEDDING])) + # self.drop = nn.Dropout(config[varables.RATE_DROPOUT]) + # self.encoder_blocks = nn.ModuleList([EncoderBlock(config) for _ in range(config[varables.NUM_LAYERS])]) + # self.decoder_blocks = nn.ModuleList([DecoderBlock(config) for _ in range(config[varables.NUM_LAYERS])]) + # self.blocks = nn.Sequential(*[DecoderBlock(config) for _ in range(config[varables.NUM_LAYERS])]) + # self.ln_f = nn.LayerNorm(config[varables.DIM_EMBEDDING]) + # self.head = 
nn.Linear(config[varables.DIM_EMBEDDING], config[varables.SIZE_VOCAB], bias=False) + # self.block_size = config[varables.SIZE_BLOCK] + # self.apply(self._init_weights) + # logger.info("number of parameters: %e", sum(p.numel() for p in self.parameters())) + self.optimizer = None + + def get_block_size(self): + return self.block_size + + def _init_weights(self, module): + if isinstance(module, (nn.Linear, nn.Embedding)): + module.weight.data.normal_(mean=0.0, std=0.02) + if isinstance(module, nn.Linear) and module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.LayerNorm): + module.bias.data.zero_() + module.weight.data.fill_(1.0) + def init_optimizers(self,train_config): + optimizer = torch.optim.Adam(self.parameters(), lr=train_config[varables.RATE_LEARNING]) + return optimizer + def init_scheduler(self,train_config): + scheduler = torch.optim.lr_scheduler.StepLR(self.optimizer, step_size=train_config[varables.SIZE_STEP], gamma=train_config[varables.GAMMA]) + return scheduler + def get_collate_fn(self, vocab_encoder,vocab_decoder): + def collate(results): + x_in = [a[0] for a in results] + y_in = [a[1] for a in results] + boundary = -1 + max_len_x = max([len(a) for a in x_in]) + max_len_y = max([len(a) for a in y_in]) + x = torch.tensor([(a+[vocab_encoder[varables.TOKEN_PAD]]*(max_len_x-len(a))) for a in x_in],dtype=torch.long) + y = torch.tensor([(a+[vocab_decoder[varables.TOKEN_PAD]]*(max_len_y-len(a))) for a in y_in],dtype=torch.long) + return x,y,boundary + return collate + def forward(self, src, trg, trg_out, boundary=None): + src_mask = None + trg_mask = torch.tril(torch.ones(trg.shape[1], trg.shape[1])).view(1, 1, trg.shape[1], trg.shape[1]).to(trg.device) + e_outputs = self.encoder(src, src_mask) + d_output = self.decoder(trg, e_outputs, src_mask, trg_mask) + logits = self.out(d_output) + loss = None + if trg_out is not None: + loss = F.cross_entropy(logits.view(-1, logits.size(-1)), trg_out.view(-1)) + return logits, loss + +# 
mark test \ No newline at end of file diff --git a/SCMG/models/Transformer_debug2/model copy.py b/SCMG/models/Transformer_debug2/model copy.py new file mode 100644 index 0000000000000000000000000000000000000000..85ed98da342e63696371099158471e07cd1bf25c --- /dev/null +++ b/SCMG/models/Transformer_debug2/model copy.py @@ -0,0 +1,187 @@ +import math +import logging + +import torch +import torch.nn as nn +from torch.nn import functional as F + +logger = logging.getLogger(__name__) +from SCMG.config import varables + +# class ModelConfig(): +# rate_dropout_embedding = 0.1 +# rate_dropout_residue = 0.1 +# rate_dropout_attention = 0.1 +# block_size=125 +# def __init__(self, size_vocab, **kwargs): +# self.size_vocab = size_vocab +# for k,v in kwargs.items(): +# setattr(self, k, v) + +class CausalSelfAttention(nn.Module): + def __init__(self, config): + super().__init__() + assert config[varables.DIM_ATTENTION] % config[varables.NUM_HEADS] == 0 + self.key = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION]) + self.query = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION]) + self.value = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION]) + self.dropout_attention = nn.Dropout(config[varables.RATE_DROPOUT]) + self.dropout_residue = nn.Dropout(config[varables.RATE_DROPOUT]) + self.projection = nn.Linear(config[varables.DIM_ATTENTION], config[varables.DIM_EMBEDDING]) + self.register_buffer("mask", torch.tril(torch.ones(config[varables.SIZE_BLOCK], config[varables.SIZE_BLOCK])) + .view(1, 1, config[varables.SIZE_BLOCK], config[varables.SIZE_BLOCK])) + self.n_head = config[varables.NUM_HEADS] + self.single_head_dim = config[varables.DIM_ATTENTION] // self.n_head + self.attention_features = config[varables.DIM_ATTENTION] + + def forward(self, x, layer_past=None): + B, T, C = x.size() + k = self.key(x).view(B, T, self.n_head,self.single_head_dim).transpose(1, 2) + q = self.query(x).view(B, T, 
self.n_head,self.single_head_dim).transpose(1, 2) + v = self.value(x).view(B, T, self.n_head,self.single_head_dim).transpose(1, 2) + att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1))) + att = att.masked_fill(self.mask[:,:,:T,:T] == 0, float('-inf')) + att = F.softmax(att, dim=-1) + att = self.dropout_attention(att) + y = att @ v + y = y.transpose(1, 2).contiguous().view(B, T, self.attention_features) + y = self.dropout_residue(self.projection(y)) + return y + + +class CrossAttention(nn.Module): + def __init__(self, config): + super().__init__() + assert config[varables.DIM_ATTENTION] % config[varables.NUM_HEADS] == 0 + self.key = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION]) + self.query = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION]) + self.value = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION]) + self.dropout_attention = nn.Dropout(config[varables.RATE_DROPOUT]) + self.dropout_residue = nn.Dropout(config[varables.RATE_DROPOUT]) + self.projection = nn.Linear(config[varables.DIM_ATTENTION], config[varables.DIM_EMBEDDING]) + self.n_head = config[varables.NUM_HEADS] + self.single_head_dim = config[varables.DIM_ATTENTION] // self.n_head + self.attention_features = config[varables.DIM_ATTENTION] + self.register_buffer("mask", torch.tril(torch.ones(config[varables.SIZE_BLOCK], config[varables.SIZE_BLOCK])) + .view(1, 1, config[varables.SIZE_BLOCK], config[varables.SIZE_BLOCK])) + + def forward(self, x_encoder,x_decoder, layer_past=None): + B_encoder, T_encoder, C_encoder = x_encoder.size() + B_decoder, T_decoder, C_decoder = x_decoder.size() + k = self.key( x_encoder).view(B_encoder, T_encoder, self.n_head,self.single_head_dim).transpose(1, 2) + q = self.query(x_decoder).view(B_encoder, T_decoder, self.n_head,self.single_head_dim).transpose(1, 2) + v = self.value(x_encoder).view(B_encoder, T_encoder, self.n_head,self.single_head_dim).transpose(1, 2) + att = (q @ 
k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1))) + att = att.masked_fill(self.mask[:,:,:T_decoder,:T_encoder] == 0, float('-inf')) + att = F.softmax(att, dim=-1) + att = self.dropout_attention(att) + y = att @ v + y = y.transpose(1, 2).contiguous().view(B_encoder, T_decoder, self.attention_features) + y = self.dropout_residue(self.projection(y)) + return y + + + + +class EncoderBlock(nn.Module): + def __init__(self, config): + super().__init__() + self.ln1 = nn.LayerNorm(config[varables.DIM_EMBEDDING]) + self.ln2 = nn.LayerNorm(config[varables.DIM_EMBEDDING]) + self.attn = CausalSelfAttention(config) + self.mlp = nn.Sequential( + nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_FEEDFORWARD]), + nn.GELU(), + nn.Linear(config[varables.DIM_FEEDFORWARD], config[varables.DIM_EMBEDDING]), + nn.Dropout(config[varables.RATE_DROPOUT]), + ) + + def forward(self, x): + # = y_input + x = x + self.attn(self.ln1(x)) + x = x + self.mlp(self.ln2(x)) + return x + +class DecoderBlock(nn.Module): + def __init__(self, config): + super().__init__() + self.ln1 = nn.LayerNorm(config[varables.DIM_EMBEDDING]) + self.ln2 = nn.LayerNorm(config[varables.DIM_EMBEDDING]) + self.masked_attn = CausalSelfAttention(config) + self.cross_attn = CrossAttention(config) + self.mlp = nn.Sequential( + nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_FEEDFORWARD]), + nn.GELU(), + nn.Linear(config[varables.DIM_FEEDFORWARD], config[varables.DIM_EMBEDDING]), + nn.Dropout(config[varables.RATE_DROPOUT]), + ) + + def forward(self, x_encoder,x): + # = y_input + x = x + self.masked_attn(self.ln1(x)) + x = x + self.cross_attn(x_encoder,self.ln1(x)) + x = x + self.mlp(self.ln2(x)) + return x + +class Model(nn.Module): + def __init__(self, config): + super().__init__() + self.tok_emb = nn.Embedding(config[varables.SIZE_VOCAB], config[varables.DIM_EMBEDDING]) + self.pos_emb = nn.Parameter(torch.zeros(1, config[varables.SIZE_BLOCK], config[varables.DIM_EMBEDDING])) + self.drop = 
nn.Dropout(config[varables.RATE_DROPOUT]) + self.encoder_blocks = nn.ModuleList([EncoderBlock(config) for _ in range(config[varables.NUM_LAYERS])]) + self.decoder_blocks = nn.ModuleList([DecoderBlock(config) for _ in range(config[varables.NUM_LAYERS])]) + # self.blocks = nn.Sequential(*[DecoderBlock(config) for _ in range(config[varables.NUM_LAYERS])]) + self.ln_f = nn.LayerNorm(config[varables.DIM_EMBEDDING]) + self.head = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.SIZE_VOCAB], bias=False) + self.block_size = config[varables.SIZE_BLOCK] + self.apply(self._init_weights) + logger.info("number of parameters: %e", sum(p.numel() for p in self.parameters())) + self.optimizer = None + + def get_block_size(self): + return self.block_size + + def _init_weights(self, module): + if isinstance(module, (nn.Linear, nn.Embedding)): + module.weight.data.normal_(mean=0.0, std=0.02) + if isinstance(module, nn.Linear) and module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.LayerNorm): + module.bias.data.zero_() + module.weight.data.fill_(1.0) + def init_optimizers(self,train_config): + optimizer = torch.optim.Adam(self.parameters(), lr=train_config[varables.RATE_LEARNING]) + return optimizer + def init_scheduler(self,train_config): + scheduler = torch.optim.lr_scheduler.StepLR(self.optimizer, step_size=train_config[varables.SIZE_STEP], gamma=train_config[varables.GAMMA]) + return scheduler + def get_collate_fn(self, vocab): + def collate(results): + x_in = [a[0] for a in results] + y_in = [a[1] for a in results] + boundary = -1 + max_len_x = max([len(a) for a in x_in]) + max_len_y = max([len(a) for a in y_in]) + x = torch.tensor([(a+[vocab[varables.TOKEN_PAD]]*(max_len_x-len(a))) for a in x_in],dtype=torch.long) + y = torch.tensor([(a+[vocab[varables.TOKEN_PAD]]*(max_len_y-len(a))) for a in y_in],dtype=torch.long) + return x,y,boundary + return collate + + def forward(self, x_in, y_in, y_out=None,boundary=None): + x_in = 
self.drop(self.tok_emb(x_in) + self.pos_emb[:, :x_in.size()[1], :]) + y_in = self.drop(self.tok_emb(y_in) + self.pos_emb[:, :y_in.size()[1], :]) + # + for encoder_block in self.encoder_blocks: + x_in = encoder_block(x_in) + x_in = self.ln_f(x_in) + for decoder_block in self.decoder_blocks: + y_in = decoder_block(x_in,y_in) + y_in = self.ln_f(y_in) + logits = self.head(y_in) + loss = None + if y_out is not None: + loss = F.cross_entropy(logits.view(-1, logits.size(-1)), y_out.view(-1)) + return logits, loss + +# mark test \ No newline at end of file diff --git a/SCMG/models/Transformer_debug2/model.py b/SCMG/models/Transformer_debug2/model.py new file mode 100644 index 0000000000000000000000000000000000000000..a3190f0c3ef2b4e5f0f10aea05f644b7514b73ad --- /dev/null +++ b/SCMG/models/Transformer_debug2/model.py @@ -0,0 +1,246 @@ +import math +import logging + +import torch +import torch.nn as nn +from torch.nn import functional as F + +# logger = logging.getLogger(__name__) +from SCMG.config import varables +from torch.autograd import Variable + +class PositionalEncoder(nn.Module): + def __init__(self, config): + super(PositionalEncoder, self).__init__() + self.Dropout = nn.Dropout(p=config[varables.RATE_DROPOUT]) + max_len = config[varables.SIZE_BLOCK] + pe = torch.zeros(max_len, config[varables.DIM_ATTENTION]) + position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1) + div_term = torch.exp(torch.arange(0, config[varables.DIM_ATTENTION], 2).float() * (-math.log(10000.0) / config[varables.DIM_ATTENTION])) + pe[:, 0::2] = torch.sin(position * div_term) + pe[:, 1::2] = torch.cos(position * div_term) + pe = pe.unsqueeze(0) + self.register_buffer('pe', pe) + def forward(self, T): + x = self.Dropout(self.pe[:,:T, :]) + return x + + +class Attention(nn.Module): + def __init__(self, config): + super().__init__() + assert config[varables.DIM_ATTENTION] % config[varables.NUM_HEADS] == 0 + self.Key = nn.Linear(config[varables.DIM_EMBEDDING], 
config[varables.DIM_ATTENTION]) + self.Query = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION]) + self.Value = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION]) + self.Dropout_Attention = nn.Dropout(config[varables.RATE_DROPOUT]) + self.Dropout_Residue = nn.Dropout(config[varables.RATE_DROPOUT]) + self.Projection = nn.Linear(config[varables.DIM_ATTENTION], config[varables.DIM_EMBEDDING]) + self.NumberOfHeads = config[varables.NUM_HEADS] + self.DimHead = config[varables.DIM_ATTENTION] // self.NumberOfHeads + self.DimAttention = config[varables.DIM_ATTENTION] + + def forward(self, X_1,X_2, mask=None): + if X_2 is None: + X_2 = X_1 + BatchSize, T_Encoder, _ = X_1.size() + BatchSize, T_Decoder, _ = X_2.size() + K = self.Key( X_1).view(BatchSize, T_Encoder, self.NumberOfHeads,self.DimHead).transpose(1, 2) + Q = self.Query(X_2).view(BatchSize, T_Decoder, self.NumberOfHeads,self.DimHead).transpose(1, 2) + V = self.Value(X_1).view(BatchSize, T_Encoder, self.NumberOfHeads,self.DimHead).transpose(1, 2) + # k,q,v dimension: (BatchSize, SequenceSize, NumberOfHeads, HeadDimension) 3,4,5,16 + ScoreAttention = (Q @ K.transpose(-2, -1)) / math.sqrt(self.DimHead) + ScoreAttention = ScoreAttention.masked_fill(mask==0, -1e9) + ScoreAttention = F.softmax(ScoreAttention, dim=-1) + ScoreAttention = self.Dropout_Attention(ScoreAttention) + # k.transpose(-2,-1): 3,4,16,5 + # (q@(k.transpose(-2,-1))): 3,4,5,5 + Z = ScoreAttention @ V + # y dimension: 3,4,5,16 + Z = Z.transpose(1, 2).contiguous().view(BatchSize, T_Decoder, self.DimAttention) + # y dimension: 3,5,64 + Z = self.Dropout_Residue(self.Projection(Z)) + return Z + + + + + + + + + + +class FeedForward(nn.Module): + def __init__(self, config): + super().__init__() + if config[varables.DIM_FEEDFORWARD] == 0: + Dim_FeedForward = config[varables.DIM_ATTENTION] *4 + else: + Dim_FeedForward = config[varables.DIM_FEEDFORWARD] + self.Linear1 = nn.Linear(config[varables.DIM_EMBEDDING], 
Dim_FeedForward) + self.GELU = nn.GELU() + self.Linear2 = nn.Linear(Dim_FeedForward, config[varables.DIM_EMBEDDING]) + self.Dropout = nn.Dropout(config[varables.RATE_DROPOUT]) + + def forward(self,x): + x = self.Linear1(x) + x = self.GELU (x) + x = self.Dropout(x) + x = self.Linear2(x) + return x + + + + +class EncoderBlock(nn.Module): + def __init__(self, config): + super().__init__() + self.LayerNorm1 = nn.LayerNorm(config[varables.DIM_EMBEDDING]) + self.LayerNorm2 = nn.LayerNorm(config[varables.DIM_EMBEDDING]) + self.Dropout1 = nn.Dropout(config[varables.RATE_DROPOUT]) + self.Dropout2 = nn.Dropout(config[varables.RATE_DROPOUT]) + self.Attention = Attention( config) + self.FeedForward = FeedForward(config) + + def forward(self, X_Encoder,Mask_Encoder): + X_Encoder = self.Dropout1(X_Encoder + self.Attention (self.LayerNorm1(X_Encoder), None, Mask_Encoder)) + X_Encoder = self.Dropout2(X_Encoder + self.FeedForward(self.LayerNorm2(X_Encoder))) + return X_Encoder + +class DecoderBlock(nn.Module): + def __init__(self, config): + super().__init__() + self.LayerNorm1 = nn.LayerNorm(config[varables.DIM_EMBEDDING]) + self.LayerNorm2 = nn.LayerNorm(config[varables.DIM_EMBEDDING]) + self.LayerNorm3 = nn.LayerNorm(config[varables.DIM_EMBEDDING]) + self.Dropout1 = nn.Dropout(config[varables.RATE_DROPOUT]) + self.Dropout2 = nn.Dropout(config[varables.RATE_DROPOUT]) + self.Dropout3 = nn.Dropout(config[varables.RATE_DROPOUT]) + self.AttentionMasked = Attention( config) + self.AttentionCross = Attention( config) + self.FeedForward = FeedForward(config) + + def forward(self, X_Encoder,X_Decoder,Mask_Cross,Mask_Decoder): + X_Decoder = self.Dropout1(X_Decoder + self.AttentionMasked(self.LayerNorm1(X_Decoder), None, Mask_Decoder)) + X_Decoder = self.Dropout2(X_Decoder + self.AttentionCross ( X_Encoder, self.LayerNorm2(X_Decoder), Mask_Cross )) + X_Decoder = self.Dropout3(X_Decoder + self.FeedForward (self.LayerNorm3(X_Decoder) )) + return X_Decoder + + + + + + + + + + + + + + + + + + 
+ +class Model(nn.Module): + def __init__(self, config): + super().__init__() + # Varables + self.Dim_Attention = config[varables.DIM_ATTENTION] + self.Token_Padding_Encoder = config["Token_Padding_Encoder"] + self.Token_Padding_Decoder = config["Token_Padding_Decoder"] + # Embedding and positional encoding layers + self.Embedding_Encoder = nn.Embedding(len(config["vocab_encoder"]), config[varables.DIM_ATTENTION]) + self.Embedding_Decoder = nn.Embedding(len(config["vocab_decoder"]), config[varables.DIM_ATTENTION]) + self.pos_emb = PositionalEncoder(config) + # Dropout and normalization layers + self.Dropout1 = nn.Dropout(config[varables.RATE_DROPOUT]) + self.Dropout2 = nn.Dropout(config[varables.RATE_DROPOUT]) + self.LayerNorm1 = nn.LayerNorm(config[varables.DIM_EMBEDDING]) + self.LayerNorm2 = nn.LayerNorm(config[varables.DIM_EMBEDDING]) + # Transformer layers + self.encoder_blocks = nn.ModuleList([EncoderBlock(config) for _ in range(config[varables.NUM_LAYERS])]) + self.decoder_blocks = nn.ModuleList([DecoderBlock(config) for _ in range(config[varables.NUM_LAYERS])]) + # Output layer + self.head = nn.Linear(config[varables.DIM_ATTENTION], len(config["vocab_decoder"]), bias=False) + # Init + self.apply(self._init_weights) + self.optimizer = None + # logger.info("number of parameters: %e", sum(p.numel() for p in self.parameters())) + + def _init_weights(self, module): + for p in module.parameters(): + if p.dim() > 1: + nn.init.xavier_uniform_(p) + # if isinstance(module, (nn.Linear, nn.Embedding)): + # module.weight.data.normal_(mean=0.0, std=0.02) + # if isinstance(module, nn.Linear) and module.bias is not None: + # module.bias.data.zero_() + # elif isinstance(module, nn.LayerNorm): + # module.bias.data.zero_() + # module.weight.data.fill_(1.0) + def init_optimizers(self,train_config): + optimizer = torch.optim.Adam(self.parameters(), lr=train_config[varables.RATE_LEARNING]) + return optimizer + def init_scheduler(self,train_config): + scheduler = 
torch.optim.lr_scheduler.StepLR(self.optimizer, step_size=train_config[varables.SIZE_STEP], gamma=train_config[varables.GAMMA]) + return scheduler + def get_collate_fn(self, vocab_encoder,vocab_decoder): + def collate(results): + X_Encoder = [a[0] for a in results] + X_Decoder = [a[1] for a in results] + boundary = -1 + max_len_x = max([len(a) for a in X_Encoder]) + max_len_y = max([len(a) for a in X_Decoder]) + x = torch.tensor([(a+[vocab_encoder[varables.TOKEN_PAD]]*(max_len_x-len(a))) for a in X_Encoder],dtype=torch.long) + y = torch.tensor([(a+[vocab_decoder[varables.TOKEN_PAD]]*(max_len_y-len(a))) for a in X_Decoder],dtype=torch.long) + return x,y,boundary + return collate + + def generate_masks(self,X_Encoder, X_Decoder): + # Generate encoder, decoder, cross masks + T = X_Decoder.shape[1] + Mask_Encoder = (X_Encoder != self.Token_Padding_Encoder).unsqueeze(-2).unsqueeze(-2) + Mask_Decoder = (X_Decoder != self.Token_Padding_Decoder).unsqueeze(-2).unsqueeze(-2).repeat(1,1,T,1) + Mask_Cross = (X_Encoder != self.Token_Padding_Encoder).unsqueeze(-2).unsqueeze(-2) + mask_tril = torch.tril(torch.ones(T, T)).view(1, 1, T, T).to(Mask_Decoder.device) + Mask_Decoder = Mask_Decoder.masked_fill(mask_tril==0,0) + return Mask_Encoder,Mask_Decoder,Mask_Cross + + def forward(self, X_Encoder, X_Decoder, Y_Decoder_Ref=None,boundary=None): + Mask_Encoder, Mask_Decoder,Mask_Cross = self.generate_masks(X_Encoder, X_Decoder) + # preprocess + X_Encoder = self.Dropout1(self.Embedding_Encoder(X_Encoder) * math.sqrt(self.Dim_Attention) + self.pos_emb(X_Encoder.size(1))) + X_Decoder = self.Dropout2(self.Embedding_Decoder(X_Decoder) * math.sqrt(self.Dim_Attention) + self.pos_emb(X_Decoder.size(1))) + #### Now X_Encoder: BatchSize, SequenceLength, DimAttention + # Encoder blocks + for encoder_block in self.encoder_blocks: + X_Encoder = encoder_block(X_Encoder,Mask_Encoder) + X_Encoder = self.LayerNorm1(X_Encoder) + # Decoder blocks + for decoder_block in self.decoder_blocks: + X_Decoder = 
decoder_block(X_Encoder,X_Decoder,Mask_Cross,Mask_Decoder) + X_Decoder = self.LayerNorm2(X_Decoder) + Y_Decoder_Logits = self.head(X_Decoder) + loss = None + if Y_Decoder_Ref is not None: + loss = F.cross_entropy(Y_Decoder_Logits.view(-1, Y_Decoder_Logits.size(-1)), Y_Decoder_Ref.view(-1),ignore_index=self.Token_Padding_Decoder) + return Y_Decoder_Logits, loss + + # def generate_masks(self,X_Encoder, X_Decoder): + # # Generate encoder, decoder, cross masks + # Mask_Encoder = (X_Encoder != self.Token_Padding_Encoder).unsqueeze(-2).int().cpu() + # Mask_Decoder = (X_Decoder != self.Token_Padding_Decoder).unsqueeze(-2).int().cpu() + # Mask_Cross = Mask_Decoder.unsqueeze(-1) @ Mask_Encoder.unsqueeze(-2) + # Mask_Encoder = Mask_Encoder.unsqueeze(-1) @ Mask_Encoder.unsqueeze(-2) + # Mask_Decoder = Mask_Decoder.unsqueeze(-1) @ Mask_Decoder.unsqueeze(-2) + # T = X_Decoder.shape[1] + # mask_tril = torch.tril(torch.ones(T, T)).view(1, 1, T, T) + # Mask_Decoder = Mask_Decoder.masked_fill(mask_tril==0,0) + # Mask_Encoder = Mask_Encoder.to(X_Encoder.device) + # Mask_Decoder = Mask_Decoder.to(X_Decoder.device) + # Mask_Cross = Mask_Cross.to(X_Encoder.device) + # return Mask_Encoder,Mask_Decoder,Mask_Cross diff --git a/SCMG/models/Transformer_debug2/sampler.py b/SCMG/models/Transformer_debug2/sampler.py new file mode 100644 index 0000000000000000000000000000000000000000..1c606302447534d425b50bbd15c153ea79895b65 --- /dev/null +++ b/SCMG/models/Transformer_debug2/sampler.py @@ -0,0 +1,85 @@ +import random +import numpy as np +import torch +import torch.nn as nn +from torch.nn import functional as F + +def set_seed(seed): + random.seed(seed) + np.random.seed(seed) + torch.manual_seed(seed) + torch.cuda.manual_seed_all(seed) + +def top_k_logits(logits, k): + v, ix = torch.topk(logits, k) + out = logits.clone() + out[out < v[:, [-1]]] = -float('Inf') + return out + +@torch.no_grad() +def sample(model, x, steps, temperature=1.0, sample=False, top_k=None): + block_size = 
model.get_block_size() + model.eval() + for k in range(steps): + x_cond = x if x.size(1) <= block_size else x[:, -block_size:] + logits, _ = model(x_cond) + logits = logits[:, -1, :] / temperature + if top_k is not None: + logits = top_k_logits(logits, top_k) + probs = F.softmax(logits, dim=-1) + if sample: + ix = torch.multinomial(probs, num_samples=1) + else: + _, ix = torch.topk(probs, k=1, dim=-1) + x = torch.cat((x, ix), dim=1) + + return x + + + + +@torch.no_grad() +def sample(model, x, steps, temperature=1.0,boundary=None): + block_size = model.get_block_size() + model.eval() + for k in range(steps): + x_cond = x if x.size(1) <= block_size else x[:, -block_size:] + logits, _ = model(x_cond,boundary=boundary) + logits = logits[:, -1, :] / temperature + probs = F.softmax(logits, dim=-1) + ix = torch.multinomial(probs, num_samples=1) + x = torch.cat((x, ix), dim=1) + return x + +'L_5*C(=O)NCc1cccc(OC)c1.*c1nsc2ccccc12COc1cccc(CNC(=O)c2cccc(NC(=O)c3nsc4ccccc34)c2)c1' + +# for i in range(1,21): +def sample_L(i,option='string'): + # i=2 + prefix = 'L_'+str(i) + string_input = prefix + '*O=C1NN=Cc2c1cccc2.*O=C(C1CC1)N1CCNCC1' + array_input = [vocab[a] for a in [''] + list(string_input)] + boundary = [len(array_input)] + tensor_input = torch.tensor(array_input,device='cuda').unsqueeze(0).repeat(32,1) + boundary = boundary*32 + tensor_output = sample(model,tensor_input,250,boundary=boundary) + strings_output = [] + for j in range(tensor_output.shape[0]): + list_string_output = [inv[a] for a in tensor_output[j,boundary[j]:].cpu().numpy() if a != vocab['']] + # if list_string_output[0] == '': + # list_string_output = list_string_output[1:] + if list_string_output[-1] == '': + list_string_output = list_string_output[:-1] + string_output = ''.join(list_string_output) + strings_output.append(string_output) + print(string_output) + for j in range(tensor_output.shape[0]): + if test_valid(strings_output[j]): + print(1) + else: + print(0) + + # logits,_ = 
model(tensor_input,boundary=boundary) + + +['', 'L', '_', '5', '*', 'C', '(', '=', 'O', ')', 'N', 'C', 'c', '1', 'c', 'c', 'c', 'c', '(', 'O', 'C', ')', 'c', '1', '.', '*', 'c', '1', 'n', 's', 'c', '2', 'c', 'c', 'c', 'c', 'c', '1', '2', 'C', 'O', 'c', '1', 'c', 'c', 'c', 'c', '(', 'C', 'N', 'C', '(', '=', 'O', ')', 'c', '2', 'c', 'c', 'c', 'c', '(', 'N', 'C', '(', '=', 'O', ')', 'c', '3', 'n', 's', 'c', '4', 'c', 'c', 'c', 'c', 'c', '3', '4', ')', 'c', '2', ')', 'c', '1', ''] diff --git a/SCMG/models/Transformer_debug3/__init__.py b/SCMG/models/Transformer_debug3/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/SCMG/models/Transformer_debug3/__pycache__/__init__.cpython-310.pyc b/SCMG/models/Transformer_debug3/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..45e210b6d15ad235e89217d27cfaa8c06fb1fc8c Binary files /dev/null and b/SCMG/models/Transformer_debug3/__pycache__/__init__.cpython-310.pyc differ diff --git a/SCMG/models/Transformer_debug3/__pycache__/model copy 2.cpython-310.pyc b/SCMG/models/Transformer_debug3/__pycache__/model copy 2.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9539e1e0a1cd72ba4f1393948122ccbb8788a395 Binary files /dev/null and b/SCMG/models/Transformer_debug3/__pycache__/model copy 2.cpython-310.pyc differ diff --git a/SCMG/models/Transformer_debug3/__pycache__/model copy.cpython-310.pyc b/SCMG/models/Transformer_debug3/__pycache__/model copy.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..39991c790e37f0536bf046f6521ecb60bdf104c0 Binary files /dev/null and b/SCMG/models/Transformer_debug3/__pycache__/model copy.cpython-310.pyc differ diff --git a/SCMG/models/Transformer_debug3/__pycache__/model.cpython-310.pyc b/SCMG/models/Transformer_debug3/__pycache__/model.cpython-310.pyc new file mode 100644 index 
0000000000000000000000000000000000000000..7700db3b42c92211764fb4960ab77403a185fdf3 Binary files /dev/null and b/SCMG/models/Transformer_debug3/__pycache__/model.cpython-310.pyc differ diff --git a/SCMG/models/Transformer_debug3/__pycache__/sampler.cpython-310.pyc b/SCMG/models/Transformer_debug3/__pycache__/sampler.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c4a7514350571c1f5cd6ca4f86c5e4c52b4ec339 Binary files /dev/null and b/SCMG/models/Transformer_debug3/__pycache__/sampler.cpython-310.pyc differ diff --git a/SCMG/models/Transformer_debug3/model copy 2.py b/SCMG/models/Transformer_debug3/model copy 2.py new file mode 100644 index 0000000000000000000000000000000000000000..8b254fbe02eafd3f7370cfae17eb6729563aa260 --- /dev/null +++ b/SCMG/models/Transformer_debug3/model copy 2.py @@ -0,0 +1,420 @@ +import math +import logging + +import torch +import torch.nn as nn +from torch.nn import functional as F + +logger = logging.getLogger(__name__) +from SCMG.config import varables + +# class ModelConfig(): +# rate_dropout_embedding = 0.1 +# rate_dropout_residue = 0.1 +# rate_dropout_attention = 0.1 +# block_size=125 +# def __init__(self, size_vocab, **kwargs): +# self.size_vocab = size_vocab +# for k,v in kwargs.items(): +# setattr(self, k, v) + +class CausalSelfAttention(nn.Module): + def __init__(self, config): + super().__init__() + assert config[varables.DIM_ATTENTION] % config[varables.NUM_HEADS] == 0 + self.key = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION]) + self.query = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION]) + self.value = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION]) + self.dropout_attention = nn.Dropout(config[varables.RATE_DROPOUT]) + self.dropout_residue = nn.Dropout(config[varables.RATE_DROPOUT]) + self.projection = nn.Linear(config[varables.DIM_ATTENTION], config[varables.DIM_EMBEDDING]) + self.register_buffer("mask", 
torch.tril(torch.ones(config[varables.SIZE_BLOCK], config[varables.SIZE_BLOCK])) + .view(1, 1, config[varables.SIZE_BLOCK], config[varables.SIZE_BLOCK])) + self.n_head = config[varables.NUM_HEADS] + self.single_head_dim = config[varables.DIM_ATTENTION] // self.n_head + self.attention_features = config[varables.DIM_ATTENTION] + + def forward(self, x, layer_past=None): + B, T, C = x.size() + k = self.key(x).view(B, T, self.n_head,self.single_head_dim).transpose(1, 2) + q = self.query(x).view(B, T, self.n_head,self.single_head_dim).transpose(1, 2) + v = self.value(x).view(B, T, self.n_head,self.single_head_dim).transpose(1, 2) + att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1))) + att = att.masked_fill(self.mask[:,:,:T,:T] == 0, float('-inf')) + att = F.softmax(att, dim=-1) + att = self.dropout_attention(att) + y = att @ v + y = y.transpose(1, 2).contiguous().view(B, T, self.attention_features) + y = self.dropout_residue(self.projection(y)) + return y + + +class CrossAttention(nn.Module): + def __init__(self, config): + super().__init__() + assert config[varables.DIM_ATTENTION] % config[varables.NUM_HEADS] == 0 + self.key = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION]) + self.query = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION]) + self.value = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION]) + self.dropout_attention = nn.Dropout(config[varables.RATE_DROPOUT]) + self.dropout_residue = nn.Dropout(config[varables.RATE_DROPOUT]) + self.projection = nn.Linear(config[varables.DIM_ATTENTION], config[varables.DIM_EMBEDDING]) + self.n_head = config[varables.NUM_HEADS] + self.single_head_dim = config[varables.DIM_ATTENTION] // self.n_head + self.attention_features = config[varables.DIM_ATTENTION] + self.register_buffer("mask", torch.tril(torch.ones(config[varables.SIZE_BLOCK], config[varables.SIZE_BLOCK])) + .view(1, 1, config[varables.SIZE_BLOCK], config[varables.SIZE_BLOCK])) + + def 
forward(self, x_encoder,x_decoder, layer_past=None): + B_encoder, T_encoder, C_encoder = x_encoder.size() + B_decoder, T_decoder, C_decoder = x_decoder.size() + k = self.key( x_encoder).view(B_encoder, T_encoder, self.n_head,self.single_head_dim).transpose(1, 2) + q = self.query(x_decoder).view(B_encoder, T_decoder, self.n_head,self.single_head_dim).transpose(1, 2) + v = self.value(x_encoder).view(B_encoder, T_encoder, self.n_head,self.single_head_dim).transpose(1, 2) + att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1))) + att = att.masked_fill(self.mask[:,:,:T_decoder,:T_encoder] == 0, float('-inf')) + att = F.softmax(att, dim=-1) + att = self.dropout_attention(att) + y = att @ v + y = y.transpose(1, 2).contiguous().view(B_encoder, T_decoder, self.attention_features) + y = self.dropout_residue(self.projection(y)) + return y + + + + +class EncoderBlock(nn.Module): + def __init__(self, config): + super().__init__() + self.ln1 = nn.LayerNorm(config[varables.DIM_EMBEDDING]) + self.ln2 = nn.LayerNorm(config[varables.DIM_EMBEDDING]) + self.attn = CausalSelfAttention(config) + self.mlp = nn.Sequential( + nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_FEEDFORWARD]), + nn.GELU(), + nn.Linear(config[varables.DIM_FEEDFORWARD], config[varables.DIM_EMBEDDING]), + nn.Dropout(config[varables.RATE_DROPOUT]), + ) + + def forward(self, x): + # = y_input + x = x + self.attn(self.ln1(x)) + x = x + self.mlp(self.ln2(x)) + return x + +class DecoderBlock(nn.Module): + def __init__(self, config): + super().__init__() + self.ln1 = nn.LayerNorm(config[varables.DIM_EMBEDDING]) + self.ln2 = nn.LayerNorm(config[varables.DIM_EMBEDDING]) + self.masked_attn = CausalSelfAttention(config) + self.cross_attn = CrossAttention(config) + self.mlp = nn.Sequential( + nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_FEEDFORWARD]), + nn.GELU(), + nn.Linear(config[varables.DIM_FEEDFORWARD], config[varables.DIM_EMBEDDING]), + nn.Dropout(config[varables.RATE_DROPOUT]), 
+ ) + + def forward(self, x_encoder,x): + # = y_input + x = x + self.masked_attn(self.ln1(x)) + x = x + self.cross_attn(x_encoder,self.ln1(x)) + x = x + self.mlp(self.ln2(x)) + return x + + + + + + + + + + + + + + + + + +import torch +import torch.nn as nn +import torch.nn.functional as F +import math + + +class Norm(nn.Module): + def __init__(self, d_model, eps = 1e-6): + super().__init__() + + self.size = d_model + + # create two learnable parameters to calibrate normalisation + self.alpha = nn.Parameter(torch.ones(self.size)) + self.bias = nn.Parameter(torch.zeros(self.size)) + + self.eps = eps + + def forward(self, x): + norm = self.alpha * (x - x.mean(dim=-1, keepdim=True)) \ + / (x.std(dim=-1, keepdim=True) + self.eps) + self.bias + return norm + +def attention(q, k, v, d_k, mask=None, dropout=None): + + scores = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(d_k) + + if mask is not None: + mask = mask.unsqueeze(1) + scores = scores.masked_fill(mask == 0, -1e9) + + scores = F.softmax(scores, dim=-1) + + if dropout is not None: + scores = dropout(scores) + + output = torch.matmul(scores, v) + return output + +class MultiHeadAttention(nn.Module): + def __init__(self, heads, d_model, dropout = 0.1): + super().__init__() + + self.d_model = d_model + self.d_k = d_model // heads + self.h = heads + + self.q_linear = nn.Linear(d_model, d_model) + self.v_linear = nn.Linear(d_model, d_model) + self.k_linear = nn.Linear(d_model, d_model) + + self.dropout = nn.Dropout(dropout) + self.out = nn.Linear(d_model, d_model) + + def forward(self, q, k, v, mask=None): + + bs = q.size(0) + + # perform linear operation and split into N heads + k = self.k_linear(k).view(bs, -1, self.h, self.d_k) + q = self.q_linear(q).view(bs, -1, self.h, self.d_k) + v = self.v_linear(v).view(bs, -1, self.h, self.d_k) + + # transpose to get dimensions bs * N * sl * d_model + k = k.transpose(1,2) + q = q.transpose(1,2) + v = v.transpose(1,2) + + + # calculate attention using function we will define 
next + scores = attention(q, k, v, self.d_k, mask, self.dropout) + # concatenate heads and put through final linear layer + concat = scores.transpose(1,2).contiguous()\ + .view(bs, -1, self.d_model) + output = self.out(concat) + + return output + +class FeedForward(nn.Module): + def __init__(self, d_model, d_ff=2048, dropout = 0.1): + super().__init__() + + # We set d_ff as a default to 2048 + self.linear_1 = nn.Linear(d_model, d_ff) + self.dropout = nn.Dropout(dropout) + self.linear_2 = nn.Linear(d_ff, d_model) + + def forward(self, x): + x = self.dropout(F.relu(self.linear_1(x))) + x = self.linear_2(x) + return x + + + + +import torch +import torch.nn as nn +import copy + + +class EncoderLayer(nn.Module): + def __init__(self, d_model, heads, dropout=0.1): + super().__init__() + self.norm_1 = Norm(d_model) + self.norm_2 = Norm(d_model) + self.attn = MultiHeadAttention(heads, d_model, dropout=dropout) + self.ff = FeedForward(d_model, dropout=dropout) + self.dropout_1 = nn.Dropout(dropout) + self.dropout_2 = nn.Dropout(dropout) + + def forward(self, x, mask): + x2 = self.norm_1(x) + x = x + self.dropout_1(self.attn(x2,x2,x2,mask)) + x2 = self.norm_2(x) + x = x + self.dropout_2(self.ff(x2)) + return x + +# build a decoder layer with two multi-head attention layers and +# one feed-forward layer +class DecoderLayer(nn.Module): + def __init__(self, d_model, heads, dropout=0.1): + super().__init__() + self.norm_1 = Norm(d_model) + self.norm_2 = Norm(d_model) + self.norm_3 = Norm(d_model) + + self.dropout_1 = nn.Dropout(dropout) + self.dropout_2 = nn.Dropout(dropout) + self.dropout_3 = nn.Dropout(dropout) + + self.attn_1 = MultiHeadAttention(heads, d_model, dropout=dropout) + self.attn_2 = MultiHeadAttention(heads, d_model, dropout=dropout) + self.ff = FeedForward(d_model, dropout=dropout) + + def forward(self, x, e_outputs, src_mask, trg_mask): + x2 = self.norm_1(x) + x = x + self.dropout_1(self.attn_1(x2, x2, x2, trg_mask)) + x2 = self.norm_2(x) + x = x + 
self.dropout_2(self.attn_2(x2, e_outputs, e_outputs, \ + src_mask)) + x2 = self.norm_3(x) + x = x + self.dropout_3(self.ff(x2)) + return x + + +import torch +import torch.nn as nn +import math +from torch.autograd import Variable + +class Embedder(nn.Module): + def __init__(self, vocab_size, d_model): + super().__init__() + self.d_model = d_model + self.embed = nn.Embedding(vocab_size, d_model) + def forward(self, x): + return self.embed(x) + +class PositionalEncoder(nn.Module): + def __init__(self, d_model, max_seq_len = 200, dropout = 0.1): + super().__init__() + self.d_model = d_model + self.dropout = nn.Dropout(dropout) + # create constant 'pe' matrix with values dependant on + # pos and i + pe = torch.zeros(max_seq_len, d_model) + for pos in range(max_seq_len): + for i in range(0, d_model, 2): + pe[pos, i] = \ + math.sin(pos / (10000 ** ((2 * i)/d_model))) + pe[pos, i + 1] = \ + math.cos(pos / (10000 ** ((2 * (i + 1))/d_model))) + pe = pe.unsqueeze(0) + self.register_buffer('pe', pe) + + + def forward(self, x): + # make embeddings relatively larger + x = x * math.sqrt(self.d_model) + #add constant to embedding + seq_len = x.size(1) + pe = Variable(self.pe[:,:seq_len], requires_grad=False) + if x.is_cuda: + pe.cuda() + x = x + pe + return self.dropout(x) + +def get_clones(module, N): + return nn.ModuleList([copy.deepcopy(module) for i in range(N)]) + +class Encoder(nn.Module): + def __init__(self, vocab_size, d_model, N, heads, dropout): + super().__init__() + self.N = N + self.embed = Embedder(vocab_size, d_model) + self.pe = PositionalEncoder(d_model, dropout=dropout) + self.layers = get_clones(EncoderLayer(d_model, heads, dropout), N) + self.norm = Norm(d_model) + def forward(self, src, mask): + x = self.embed(src) + x = self.pe(x) + for i in range(self.N): + x = self.layers[i](x, mask) + return self.norm(x) + +class Decoder(nn.Module): + def __init__(self, vocab_size, d_model, N, heads, dropout): + super().__init__() + self.N = N + self.embed = 
Embedder(vocab_size, d_model) + self.pe = PositionalEncoder(d_model, dropout=dropout) + self.layers = get_clones(DecoderLayer(d_model, heads, dropout), N) + self.norm = Norm(d_model) + def forward(self, trg, e_outputs, src_mask, trg_mask): + x = self.embed(trg) + x = self.pe(x) + for i in range(self.N): + x = self.layers[i](x, e_outputs, src_mask, trg_mask) + return self.norm(x) + +class Model(nn.Module): + def __init__(self, config): + super().__init__() + self.encoder = Encoder(len(config["vocab_encoder"]), config[varables.DIM_ATTENTION], config[varables.NUM_LAYERS], config[varables.NUM_HEADS], config[varables.RATE_DROPOUT]) + self.decoder = Decoder(len(config["vocab_decoder"]), config[varables.DIM_ATTENTION], config[varables.NUM_LAYERS], config[varables.NUM_HEADS], config[varables.RATE_DROPOUT]) + self.out = nn.Linear(config[varables.DIM_ATTENTION], len(config["vocab_decoder"])) + # self.tok_emb = nn.Embedding(config[varables.SIZE_VOCAB], config[varables.DIM_EMBEDDING]) + # self.pos_emb = nn.Parameter(torch.zeros(1, config[varables.SIZE_BLOCK], config[varables.DIM_EMBEDDING])) + # self.drop = nn.Dropout(config[varables.RATE_DROPOUT]) + # self.encoder_blocks = nn.ModuleList([EncoderBlock(config) for _ in range(config[varables.NUM_LAYERS])]) + # self.decoder_blocks = nn.ModuleList([DecoderBlock(config) for _ in range(config[varables.NUM_LAYERS])]) + # self.blocks = nn.Sequential(*[DecoderBlock(config) for _ in range(config[varables.NUM_LAYERS])]) + # self.ln_f = nn.LayerNorm(config[varables.DIM_EMBEDDING]) + # self.head = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.SIZE_VOCAB], bias=False) + # self.block_size = config[varables.SIZE_BLOCK] + # self.apply(self._init_weights) + # logger.info("number of parameters: %e", sum(p.numel() for p in self.parameters())) + self.optimizer = None + + def get_block_size(self): + return self.block_size + + def _init_weights(self, module): + if isinstance(module, (nn.Linear, nn.Embedding)): + 
module.weight.data.normal_(mean=0.0, std=0.02) + if isinstance(module, nn.Linear) and module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.LayerNorm): + module.bias.data.zero_() + module.weight.data.fill_(1.0) + def init_optimizers(self,train_config): + optimizer = torch.optim.Adam(self.parameters(), lr=train_config[varables.RATE_LEARNING]) + return optimizer + def init_scheduler(self,train_config): + scheduler = torch.optim.lr_scheduler.StepLR(self.optimizer, step_size=train_config[varables.SIZE_STEP], gamma=train_config[varables.GAMMA]) + return scheduler + def get_collate_fn(self, vocab_encoder,vocab_decoder): + def collate(results): + x_in = [a[0] for a in results] + y_in = [a[1] for a in results] + boundary = -1 + max_len_x = max([len(a) for a in x_in]) + max_len_y = max([len(a) for a in y_in]) + x = torch.tensor([(a+[vocab_encoder[varables.TOKEN_PAD]]*(max_len_x-len(a))) for a in x_in],dtype=torch.long) + y = torch.tensor([(a+[vocab_decoder[varables.TOKEN_PAD]]*(max_len_y-len(a))) for a in y_in],dtype=torch.long) + return x,y,boundary + return collate + def forward(self, src, trg, trg_out, boundary=None): + src_mask = None + trg_mask = torch.tril(torch.ones(trg.shape[1], trg.shape[1])).view(1, 1, trg.shape[1], trg.shape[1]).to(trg.device) + e_outputs = self.encoder(src, src_mask) + d_output = self.decoder(trg, e_outputs, src_mask, trg_mask) + logits = self.out(d_output) + loss = None + if trg_out is not None: + loss = F.cross_entropy(logits.view(-1, logits.size(-1)), trg_out.view(-1)) + return logits, loss + +# mark test \ No newline at end of file diff --git a/SCMG/models/Transformer_debug3/model copy.py b/SCMG/models/Transformer_debug3/model copy.py new file mode 100644 index 0000000000000000000000000000000000000000..85ed98da342e63696371099158471e07cd1bf25c --- /dev/null +++ b/SCMG/models/Transformer_debug3/model copy.py @@ -0,0 +1,187 @@ +import math +import logging + +import torch +import torch.nn as nn +from torch.nn import 
class CausalSelfAttention(nn.Module):
    """Multi-head self-attention with a fixed lower-triangular (causal) mask."""

    def __init__(self, config):
        super().__init__()
        assert config[varables.DIM_ATTENTION] % config[varables.NUM_HEADS] == 0
        d_in = config[varables.DIM_EMBEDDING]
        d_att = config[varables.DIM_ATTENTION]
        block = config[varables.SIZE_BLOCK]
        self.key = nn.Linear(d_in, d_att)
        self.query = nn.Linear(d_in, d_att)
        self.value = nn.Linear(d_in, d_att)
        self.dropout_attention = nn.Dropout(config[varables.RATE_DROPOUT])
        self.dropout_residue = nn.Dropout(config[varables.RATE_DROPOUT])
        self.projection = nn.Linear(d_att, d_in)
        # (1, 1, block, block) causal mask shared by every forward pass
        self.register_buffer(
            "mask", torch.tril(torch.ones(block, block)).view(1, 1, block, block)
        )
        self.n_head = config[varables.NUM_HEADS]
        self.single_head_dim = d_att // self.n_head
        self.attention_features = d_att

    def forward(self, x, layer_past=None):
        batch, steps, _ = x.size()

        def split_heads(proj):
            # (B, T, d_att) -> (B, heads, T, head_dim)
            return proj(x).view(batch, steps, self.n_head, self.single_head_dim).transpose(1, 2)

        k = split_heads(self.key)
        q = split_heads(self.query)
        v = split_heads(self.value)
        scores = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1)))
        scores = scores.masked_fill(self.mask[:, :, :steps, :steps] == 0, float("-inf"))
        weights = self.dropout_attention(F.softmax(scores, dim=-1))
        mixed = (weights @ v).transpose(1, 2).contiguous().view(batch, steps, self.attention_features)
        return self.dropout_residue(self.projection(mixed))


class CrossAttention(nn.Module):
    """Multi-head cross-attention: decoder queries attend to encoder keys/values."""

    def __init__(self, config):
        super().__init__()
        assert config[varables.DIM_ATTENTION] % config[varables.NUM_HEADS] == 0
        d_in = config[varables.DIM_EMBEDDING]
        d_att = config[varables.DIM_ATTENTION]
        block = config[varables.SIZE_BLOCK]
        self.key = nn.Linear(d_in, d_att)
        self.query = nn.Linear(d_in, d_att)
        self.value = nn.Linear(d_in, d_att)
        self.dropout_attention = nn.Dropout(config[varables.RATE_DROPOUT])
        self.dropout_residue = nn.Dropout(config[varables.RATE_DROPOUT])
        self.projection = nn.Linear(d_att, d_in)
        self.n_head = config[varables.NUM_HEADS]
        self.single_head_dim = d_att // self.n_head
        self.attention_features = d_att
        self.register_buffer(
            "mask", torch.tril(torch.ones(block, block)).view(1, 1, block, block)
        )

    def forward(self, x_encoder, x_decoder, layer_past=None):
        batch, t_enc, _ = x_encoder.size()
        _, t_dec, _ = x_decoder.size()
        k = self.key(x_encoder).view(batch, t_enc, self.n_head, self.single_head_dim).transpose(1, 2)
        q = self.query(x_decoder).view(batch, t_dec, self.n_head, self.single_head_dim).transpose(1, 2)
        v = self.value(x_encoder).view(batch, t_enc, self.n_head, self.single_head_dim).transpose(1, 2)
        scores = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1)))
        # NOTE(review): a causal (tril) mask is applied to *cross*-attention,
        # so decoder position t cannot see encoder positions > t.  Standard
        # cross-attention attends to all encoder positions; the newer model.py
        # in this package uses a padding-based Mask_Cross instead — confirm
        # this restriction is intentional here.
        scores = scores.masked_fill(self.mask[:, :, :t_dec, :t_enc] == 0, float("-inf"))
        weights = self.dropout_attention(F.softmax(scores, dim=-1))
        mixed = (weights @ v).transpose(1, 2).contiguous().view(batch, t_dec, self.attention_features)
        return self.dropout_residue(self.projection(mixed))
class EncoderBlock(nn.Module):
    """Pre-norm encoder block: self-attention then GELU MLP, with residual adds."""

    def __init__(self, config):
        super().__init__()
        d_emb = config[varables.DIM_EMBEDDING]
        d_ff = config[varables.DIM_FEEDFORWARD]
        self.ln1 = nn.LayerNorm(d_emb)
        self.ln2 = nn.LayerNorm(d_emb)
        self.attn = CausalSelfAttention(config)
        self.mlp = nn.Sequential(
            nn.Linear(d_emb, d_ff),
            nn.GELU(),
            nn.Linear(d_ff, d_emb),
            nn.Dropout(config[varables.RATE_DROPOUT]),
        )

    def forward(self, x):
        # NOTE(review): CausalSelfAttention masks future positions even though
        # this is the encoder side — confirm that is intended.
        x = x + self.attn(self.ln1(x))
        return x + self.mlp(self.ln2(x))


class DecoderBlock(nn.Module):
    """Pre-norm decoder block: masked self-attention, cross-attention, MLP."""

    def __init__(self, config):
        super().__init__()
        d_emb = config[varables.DIM_EMBEDDING]
        d_ff = config[varables.DIM_FEEDFORWARD]
        self.ln1 = nn.LayerNorm(d_emb)
        self.ln2 = nn.LayerNorm(d_emb)
        self.masked_attn = CausalSelfAttention(config)
        self.cross_attn = CrossAttention(config)
        self.mlp = nn.Sequential(
            nn.Linear(d_emb, d_ff),
            nn.GELU(),
            nn.Linear(d_ff, d_emb),
            nn.Dropout(config[varables.RATE_DROPOUT]),
        )

    def forward(self, x_encoder, x):
        x = x + self.masked_attn(self.ln1(x))
        # NOTE(review): ln1 is reused here; a standard pre-norm decoder would
        # give the cross-attention sublayer its own LayerNorm.
        x = x + self.cross_attn(x_encoder, self.ln1(x))
        return x + self.mlp(self.ln2(x))


class Model(nn.Module):
    """Encoder-decoder transformer sharing one token-embedding table and one
    learned positional embedding for both the encoder and decoder inputs."""

    def __init__(self, config):
        super().__init__()
        d_emb = config[varables.DIM_EMBEDDING]
        n_layers = config[varables.NUM_LAYERS]
        self.tok_emb = nn.Embedding(config[varables.SIZE_VOCAB], d_emb)
        # learned absolute positions, shared by encoder and decoder inputs
        self.pos_emb = nn.Parameter(torch.zeros(1, config[varables.SIZE_BLOCK], d_emb))
        self.drop = nn.Dropout(config[varables.RATE_DROPOUT])
        self.encoder_blocks = nn.ModuleList(EncoderBlock(config) for _ in range(n_layers))
        self.decoder_blocks = nn.ModuleList(DecoderBlock(config) for _ in range(n_layers))
        self.ln_f = nn.LayerNorm(d_emb)
        self.head = nn.Linear(d_emb, config[varables.SIZE_VOCAB], bias=False)
        self.block_size = config[varables.SIZE_BLOCK]
        self.apply(self._init_weights)
        logger.info("number of parameters: %e", sum(p.numel() for p in self.parameters()))
        self.optimizer = None  # assigned externally after init_optimizers()

    def get_block_size(self):
        return self.block_size

    def _init_weights(self, module):
        """GPT-style init: N(0, 0.02) weights, zero biases, unit LayerNorm."""
        if isinstance(module, (nn.Linear, nn.Embedding)):
            module.weight.data.normal_(mean=0.0, std=0.02)
            if isinstance(module, nn.Linear) and module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

    def init_optimizers(self, train_config):
        """Create an Adam optimizer over all parameters (not stored on self)."""
        return torch.optim.Adam(self.parameters(), lr=train_config[varables.RATE_LEARNING])

    def init_scheduler(self, train_config):
        """Create a StepLR scheduler bound to the externally-set self.optimizer."""
        return torch.optim.lr_scheduler.StepLR(
            self.optimizer,
            step_size=train_config[varables.SIZE_STEP],
            gamma=train_config[varables.GAMMA],
        )

    def get_collate_fn(self, vocab):
        """Collate fn padding both token lists with the shared vocab's pad id."""
        def collate(samples):
            xs = [s[0] for s in samples]
            ys = [s[1] for s in samples]
            widest_x = max(len(s) for s in xs)
            widest_y = max(len(s) for s in ys)
            pad = vocab[varables.TOKEN_PAD]
            x = torch.tensor([s + [pad] * (widest_x - len(s)) for s in xs], dtype=torch.long)
            y = torch.tensor([s + [pad] * (widest_y - len(s)) for s in ys], dtype=torch.long)
            return x, y, -1  # boundary = -1: unused by this model
        return collate

    def forward(self, x_in, y_in, y_out=None, boundary=None):
        """Embed, run encoder then decoder stacks, project to vocabulary logits.

        Returns (logits, loss); loss is None unless ``y_out`` targets are given.
        """
        x_in = self.drop(self.tok_emb(x_in) + self.pos_emb[:, : x_in.size()[1], :])
        y_in = self.drop(self.tok_emb(y_in) + self.pos_emb[:, : y_in.size()[1], :])
        for block in self.encoder_blocks:
            x_in = block(x_in)
        x_in = self.ln_f(x_in)
        for block in self.decoder_blocks:
            y_in = block(x_in, y_in)
        # NOTE(review): ln_f is shared by encoder and decoder outputs
        y_in = self.ln_f(y_in)
        logits = self.head(y_in)
        loss = None
        if y_out is not None:
            loss = F.cross_entropy(logits.view(-1, logits.size(-1)), y_out.view(-1))
        return logits, loss
class PositionalEncoder(nn.Module):
    """Fixed sinusoidal positional-encoding table with dropout.

    ``forward(T)`` returns the encodings for the first T positions, shaped
    (1, T, DIM_ATTENTION); the caller adds them to the embeddings.
    """

    def __init__(self, config):
        super(PositionalEncoder, self).__init__()
        self.Dropout = nn.Dropout(p=config[varables.RATE_DROPOUT])
        max_len = config[varables.SIZE_BLOCK]
        dim = config[varables.DIM_ATTENTION]
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(
            torch.arange(0, dim, 2).float() * (-math.log(10000.0) / dim)
        )
        table = torch.zeros(max_len, dim)
        table[:, 0::2] = torch.sin(position * div_term)
        table[:, 1::2] = torch.cos(position * div_term)
        self.register_buffer("pe", table.unsqueeze(0))

    def forward(self, T):
        # NOTE(review): dropout is applied to the encodings alone, before they
        # are added to the embeddings in Model.forward — confirm intended.
        return self.Dropout(self.pe[:, :T, :])


class Attention(nn.Module):
    """Multi-head attention used for both self- and cross-attention.

    ``X_1`` supplies keys/values, ``X_2`` supplies queries; passing ``X_2=None``
    means self-attention on ``X_1``.  Positions where ``mask`` is 0 receive
    -1e9 before softmax; despite the default, a mask must be provided
    (``mask=None`` would fail in ``masked_fill``).
    """

    def __init__(self, config):
        super().__init__()
        assert config[varables.DIM_ATTENTION] % config[varables.NUM_HEADS] == 0
        d_in = config[varables.DIM_EMBEDDING]
        d_att = config[varables.DIM_ATTENTION]
        self.Key = nn.Linear(d_in, d_att)
        self.Query = nn.Linear(d_in, d_att)
        self.Value = nn.Linear(d_in, d_att)
        self.Dropout_Attention = nn.Dropout(config[varables.RATE_DROPOUT])
        self.Dropout_Residue = nn.Dropout(config[varables.RATE_DROPOUT])
        self.Projection = nn.Linear(d_att, d_in)
        self.NumberOfHeads = config[varables.NUM_HEADS]
        self.DimHead = d_att // self.NumberOfHeads
        self.DimAttention = d_att

    def forward(self, X_1, X_2, mask=None):
        if X_2 is None:
            X_2 = X_1
        batch, t_kv, _ = X_1.size()
        _, t_q, _ = X_2.size()
        # (B, T, d_att) -> (B, heads, T, head_dim)
        K = self.Key(X_1).view(batch, t_kv, self.NumberOfHeads, self.DimHead).transpose(1, 2)
        Q = self.Query(X_2).view(batch, t_q, self.NumberOfHeads, self.DimHead).transpose(1, 2)
        V = self.Value(X_1).view(batch, t_kv, self.NumberOfHeads, self.DimHead).transpose(1, 2)
        scores = (Q @ K.transpose(-2, -1)) / math.sqrt(self.DimHead)
        scores = scores.masked_fill(mask == 0, -1e9)
        weights = self.Dropout_Attention(F.softmax(scores, dim=-1))
        Z = (weights @ V).transpose(1, 2).contiguous().view(batch, t_q, self.DimAttention)
        return self.Dropout_Residue(self.Projection(Z))


class FeedForward(nn.Module):
    """Position-wise MLP; DIM_FEEDFORWARD == 0 means use 4 * DIM_ATTENTION."""

    def __init__(self, config):
        super().__init__()
        d_ff = config[varables.DIM_FEEDFORWARD]
        hidden = config[varables.DIM_ATTENTION] * 4 if d_ff == 0 else d_ff
        self.Linear1 = nn.Linear(config[varables.DIM_EMBEDDING], hidden)
        self.GELU = nn.GELU()
        self.Linear2 = nn.Linear(hidden, config[varables.DIM_EMBEDDING])
        self.Dropout = nn.Dropout(config[varables.RATE_DROPOUT])

    def forward(self, x):
        # dropout sits between the activation and the output projection
        return self.Linear2(self.Dropout(self.GELU(self.Linear1(x))))
class EncoderBlock(nn.Module):
    """Post-norm encoder block: self-attention + feed-forward with residuals.

    Dropout is applied to the sublayer *inputs* (kept exactly as written).
    """

    def __init__(self, config):
        super().__init__()
        d_emb = config[varables.DIM_EMBEDDING]
        p = config[varables.RATE_DROPOUT]
        self.LayerNorm1 = nn.LayerNorm(d_emb)
        self.LayerNorm2 = nn.LayerNorm(d_emb)
        self.Dropout1 = nn.Dropout(p)
        self.Dropout2 = nn.Dropout(p)
        self.Attention = Attention(config)
        self.FeedForward = FeedForward(config)

    def forward(self, X_Encoder, Mask_Encoder):
        attended = self.Attention(self.Dropout1(X_Encoder), None, Mask_Encoder)
        X_Encoder = self.LayerNorm1(X_Encoder + attended)
        X_Encoder = self.LayerNorm2(X_Encoder + self.FeedForward(self.Dropout2(X_Encoder)))
        return X_Encoder


class DecoderBlock(nn.Module):
    """Post-norm decoder block: masked self-attention, cross-attention, feed-forward."""

    def __init__(self, config):
        super().__init__()
        d_emb = config[varables.DIM_EMBEDDING]
        p = config[varables.RATE_DROPOUT]
        self.LayerNorm1 = nn.LayerNorm(d_emb)
        self.LayerNorm2 = nn.LayerNorm(d_emb)
        self.LayerNorm3 = nn.LayerNorm(d_emb)
        self.Dropout1 = nn.Dropout(p)
        self.Dropout2 = nn.Dropout(p)
        self.Dropout3 = nn.Dropout(p)
        self.AttentionMasked = Attention(config)
        self.AttentionCross = Attention(config)
        self.FeedForward = FeedForward(config)

    def forward(self, X_Encoder, X_Decoder, Mask_Cross, Mask_Decoder):
        self_att = self.AttentionMasked(self.Dropout1(X_Decoder), None, Mask_Decoder)
        X_Decoder = self.LayerNorm1(X_Decoder + self_att)
        cross_att = self.AttentionCross(X_Encoder, self.Dropout2(X_Decoder), Mask_Cross)
        X_Decoder = self.LayerNorm2(X_Decoder + cross_att)
        X_Decoder = self.LayerNorm3(X_Decoder + self.FeedForward(self.Dropout3(X_Decoder)))
        return X_Decoder


class Model(nn.Module):
    """Encoder-decoder transformer with padding-aware attention masks.

    Padding tokens are excluded from the loss via ``ignore_index`` and from
    attention via masks derived from the configured padding-token ids.
    """

    def __init__(self, config):
        super().__init__()
        self.Dim_Attention = config[varables.DIM_ATTENTION]
        self.Token_Padding_Encoder = config["Token_Padding_Encoder"]
        self.Token_Padding_Decoder = config["Token_Padding_Decoder"]
        # separate embedding tables for the two vocabularies
        self.Embedding_Encoder = nn.Embedding(len(config["vocab_encoder"]), self.Dim_Attention)
        self.Embedding_Decoder = nn.Embedding(len(config["vocab_decoder"]), self.Dim_Attention)
        self.pos_emb = PositionalEncoder(config)
        self.Dropout1 = nn.Dropout(config[varables.RATE_DROPOUT])
        self.Dropout2 = nn.Dropout(config[varables.RATE_DROPOUT])
        # NOTE(review): LayerNorm1/2 are created but their uses in forward are
        # commented out — kept so existing checkpoints still load.
        self.LayerNorm1 = nn.LayerNorm(config[varables.DIM_EMBEDDING])
        self.LayerNorm2 = nn.LayerNorm(config[varables.DIM_EMBEDDING])
        self.encoder_blocks = nn.ModuleList(
            EncoderBlock(config) for _ in range(config[varables.NUM_LAYERS])
        )
        self.decoder_blocks = nn.ModuleList(
            DecoderBlock(config) for _ in range(config[varables.NUM_LAYERS])
        )
        self.head = nn.Linear(self.Dim_Attention, len(config["vocab_decoder"]), bias=False)
        self.apply(self._init_weights)
        self.optimizer = None  # assigned externally after init_optimizers()

    def _init_weights(self, module):
        # Xavier-init every weight matrix.  NOTE(review): nn.Module.apply visits
        # every submodule and parameters() recurses, so each parameter is
        # re-initialised several times; the final values are still xavier.
        for p in module.parameters():
            if p.dim() > 1:
                nn.init.xavier_uniform_(p)

    def init_optimizers(self, train_config):
        """Create an Adam optimizer over all parameters (not stored on self)."""
        return torch.optim.Adam(self.parameters(), lr=train_config[varables.RATE_LEARNING])

    def init_scheduler(self, train_config):
        """Create a StepLR scheduler bound to the externally-set self.optimizer."""
        return torch.optim.lr_scheduler.StepLR(
            self.optimizer,
            step_size=train_config[varables.SIZE_STEP],
            gamma=train_config[varables.GAMMA],
        )

    def get_collate_fn(self, vocab_encoder, vocab_decoder):
        """Collate fn that right-pads encoder/decoder token lists into LongTensors."""
        def collate(samples):
            enc = [s[0] for s in samples]
            dec = [s[1] for s in samples]
            w_enc = max(len(s) for s in enc)
            w_dec = max(len(s) for s in dec)
            x = torch.tensor(
                [s + [vocab_encoder[varables.TOKEN_PAD]] * (w_enc - len(s)) for s in enc],
                dtype=torch.long,
            )
            y = torch.tensor(
                [s + [vocab_decoder[varables.TOKEN_PAD]] * (w_dec - len(s)) for s in dec],
                dtype=torch.long,
            )
            return x, y, -1  # boundary = -1: unused by this model
        return collate

    def generate_masks(self, X_Encoder, X_Decoder):
        """Build (encoder, decoder, cross) attention masks from padding ids.

        The decoder mask combines key-padding with a lower-triangular causal
        mask; encoder and cross masks hide padded encoder positions only.
        """
        T = X_Decoder.shape[1]
        Mask_Encoder = (X_Encoder != self.Token_Padding_Encoder).unsqueeze(-2).unsqueeze(-2)
        Mask_Cross = (X_Encoder != self.Token_Padding_Encoder).unsqueeze(-2).unsqueeze(-2)
        Mask_Decoder = (
            (X_Decoder != self.Token_Padding_Decoder)
            .unsqueeze(-2)
            .unsqueeze(-2)
            .repeat(1, 1, T, 1)
        )
        causal = torch.tril(torch.ones(T, T)).view(1, 1, T, T).to(Mask_Decoder.device)
        Mask_Decoder = Mask_Decoder.masked_fill(causal == 0, 0)
        return Mask_Encoder, Mask_Decoder, Mask_Cross

    def forward(self, X_Encoder, X_Decoder, Y_Decoder_Ref=None, boundary=None):
        """Return (logits, loss); loss is None unless reference targets are given."""
        Mask_Encoder, Mask_Decoder, Mask_Cross = self.generate_masks(X_Encoder, X_Decoder)
        scale = math.sqrt(self.Dim_Attention)
        X_Encoder = self.Dropout1(
            self.Embedding_Encoder(X_Encoder) * scale + self.pos_emb(X_Encoder.size(1))
        )
        X_Decoder = self.Dropout2(
            self.Embedding_Decoder(X_Decoder) * scale + self.pos_emb(X_Decoder.size(1))
        )
        for block in self.encoder_blocks:
            X_Encoder = block(X_Encoder, Mask_Encoder)
        for block in self.decoder_blocks:
            X_Decoder = block(X_Encoder, X_Decoder, Mask_Cross, Mask_Decoder)
        Y_Decoder_Logits = self.head(X_Decoder)
        loss = None
        if Y_Decoder_Ref is not None:
            loss = F.cross_entropy(
                Y_Decoder_Logits.view(-1, Y_Decoder_Logits.size(-1)),
                Y_Decoder_Ref.view(-1),
                ignore_index=self.Token_Padding_Decoder,
            )
        return Y_Decoder_Logits, loss
def set_seed(seed):
    """Seed python, numpy and torch (all devices) for reproducible sampling."""
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)  # no-op on CPU-only builds


def top_k_logits(logits, k):
    """Keep the top-k values in each row of ``logits``; set the rest to -inf.

    Returns a new tensor; ``logits`` itself is not modified.
    """
    v, _ = torch.topk(logits, k)
    out = logits.clone()
    # v[:, [-1]] is the k-th largest value per row, kept as a column so it
    # broadcasts across the row.
    out[out < v[:, [-1]]] = -float("Inf")
    return out


@torch.no_grad()
def sample(model, x, steps, temperature=1.0, boundary=None):
    """Autoregressively extend ``x`` by ``steps`` tokens via multinomial sampling.

    Fix: the original module defined two functions both named ``sample``; the
    first (with top-k support) was dead code because this definition shadowed
    it at import time, so only this effective variant is kept.  Use
    ``top_k_logits`` explicitly if top-k filtering is needed again.
    """
    block_size = model.get_block_size()
    model.eval()
    for _ in range(steps):
        # crop the context to the model's block size
        x_cond = x if x.size(1) <= block_size else x[:, -block_size:]
        logits, _ = model(x_cond, boundary=boundary)
        logits = logits[:, -1, :] / temperature
        probs = F.softmax(logits, dim=-1)
        ix = torch.multinomial(probs, num_samples=1)
        x = torch.cat((x, ix), dim=1)
    return x


def sample_L(i, option='string'):
    """Debug helper: sample 32 completions for linker prefix ``L_<i>``.

    NOTE(review): relies on module-level globals (``vocab``, ``inv``,
    ``model``, ``test_valid``) that are not defined in this file, and
    hard-codes device='cuda' — this only runs inside the interactive session
    it was pasted from; move it to a script with explicit arguments.
    """
    prefix = 'L_' + str(i)
    string_input = prefix + '*O=C1NN=Cc2c1cccc2.*O=C(C1CC1)N1CCNCC1'
    array_input = [vocab[a] for a in [''] + list(string_input)]
    boundary = [len(array_input)] * 32
    tensor_input = torch.tensor(array_input, device='cuda').unsqueeze(0).repeat(32, 1)
    tensor_output = sample(model, tensor_input, 250, boundary=boundary)
    strings_output = []
    for j in range(tensor_output.shape[0]):
        # drop pad-like tokens after the prompt boundary
        # NOTE(review): token names here appear as empty strings in the source
        # (angle-bracket tokens likely lost in transit) — verify against varables.
        list_string_output = [
            inv[a]
            for a in tensor_output[j, boundary[j]:].cpu().numpy()
            if a != vocab['']
        ]
        if list_string_output[-1] == '':
            list_string_output = list_string_output[:-1]
        string_output = ''.join(list_string_output)
        strings_output.append(string_output)
        print(string_output)
    for j in range(tensor_output.shape[0]):
        if test_valid(strings_output[j]):
            print(1)
        else:
            print(0)
class CausalSelfAttention(nn.Module):
    """Multi-head self-attention with a registered causal (tril) mask buffer."""

    def __init__(self, config):
        super().__init__()
        assert config[varables.DIM_ATTENTION] % config[varables.NUM_HEADS] == 0
        d_in = config[varables.DIM_EMBEDDING]
        d_att = config[varables.DIM_ATTENTION]
        block = config[varables.SIZE_BLOCK]
        self.key = nn.Linear(d_in, d_att)
        self.query = nn.Linear(d_in, d_att)
        self.value = nn.Linear(d_in, d_att)
        self.dropout_attention = nn.Dropout(config[varables.RATE_DROPOUT])
        self.dropout_residue = nn.Dropout(config[varables.RATE_DROPOUT])
        self.projection = nn.Linear(d_att, d_in)
        self.register_buffer(
            "mask", torch.tril(torch.ones(block, block)).view(1, 1, block, block)
        )
        self.n_head = config[varables.NUM_HEADS]
        self.single_head_dim = d_att // self.n_head
        self.attention_features = d_att

    def _split(self, tensor, batch, steps):
        # (B, T, d_att) -> (B, heads, T, head_dim)
        return tensor.view(batch, steps, self.n_head, self.single_head_dim).transpose(1, 2)

    def forward(self, x, layer_past=None):
        batch, steps, _ = x.size()
        k = self._split(self.key(x), batch, steps)
        q = self._split(self.query(x), batch, steps)
        v = self._split(self.value(x), batch, steps)
        scores = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1)))
        scores = scores.masked_fill(self.mask[:, :, :steps, :steps] == 0, float("-inf"))
        weights = self.dropout_attention(F.softmax(scores, dim=-1))
        mixed = (weights @ v).transpose(1, 2).contiguous().view(batch, steps, self.attention_features)
        return self.dropout_residue(self.projection(mixed))


class CrossAttention(nn.Module):
    """Decoder-queries-over-encoder-keys/values attention."""

    def __init__(self, config):
        super().__init__()
        assert config[varables.DIM_ATTENTION] % config[varables.NUM_HEADS] == 0
        d_in = config[varables.DIM_EMBEDDING]
        d_att = config[varables.DIM_ATTENTION]
        block = config[varables.SIZE_BLOCK]
        self.key = nn.Linear(d_in, d_att)
        self.query = nn.Linear(d_in, d_att)
        self.value = nn.Linear(d_in, d_att)
        self.dropout_attention = nn.Dropout(config[varables.RATE_DROPOUT])
        self.dropout_residue = nn.Dropout(config[varables.RATE_DROPOUT])
        self.projection = nn.Linear(d_att, d_in)
        self.n_head = config[varables.NUM_HEADS]
        self.single_head_dim = d_att // self.n_head
        self.attention_features = d_att
        self.register_buffer(
            "mask", torch.tril(torch.ones(block, block)).view(1, 1, block, block)
        )

    def forward(self, x_encoder, x_decoder, layer_past=None):
        batch, t_enc, _ = x_encoder.size()
        _, t_dec, _ = x_decoder.size()
        k = self.key(x_encoder).view(batch, t_enc, self.n_head, self.single_head_dim).transpose(1, 2)
        q = self.query(x_decoder).view(batch, t_dec, self.n_head, self.single_head_dim).transpose(1, 2)
        v = self.value(x_encoder).view(batch, t_enc, self.n_head, self.single_head_dim).transpose(1, 2)
        scores = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1)))
        # NOTE(review): causal tril mask on cross-attention restricts decoder
        # position t to encoder positions <= t — unusual; the newer model.py
        # uses a padding-based cross mask instead.  Confirm intended.
        scores = scores.masked_fill(self.mask[:, :, :t_dec, :t_enc] == 0, float("-inf"))
        weights = self.dropout_attention(F.softmax(scores, dim=-1))
        mixed = (weights @ v).transpose(1, 2).contiguous().view(batch, t_dec, self.attention_features)
        return self.dropout_residue(self.projection(mixed))
class Norm(nn.Module):
    """Layer normalisation with learnable scale (alpha) and shift (bias)."""

    def __init__(self, d_model, eps=1e-6):
        super().__init__()
        self.size = d_model
        # learnable parameters calibrating the normalisation
        self.alpha = nn.Parameter(torch.ones(self.size))
        self.bias = nn.Parameter(torch.zeros(self.size))
        self.eps = eps

    def forward(self, x):
        centered = x - x.mean(dim=-1, keepdim=True)
        scaled = centered / (x.std(dim=-1, keepdim=True) + self.eps)
        return self.alpha * scaled + self.bias


def attention(q, k, v, d_k, mask=None, dropout=None):
    """Scaled dot-product attention; ``mask == 0`` positions get -1e9."""
    scores = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(d_k)
    if mask is not None:
        mask = mask.unsqueeze(1)  # broadcast over the heads dimension
        scores = scores.masked_fill(mask == 0, -1e9)
    scores = F.softmax(scores, dim=-1)
    if dropout is not None:
        scores = dropout(scores)
    return torch.matmul(scores, v)


class MultiHeadAttention(nn.Module):
    """Multi-head attention with separate q/k/v projections and an output linear."""

    def __init__(self, heads, d_model, dropout=0.1):
        super().__init__()
        self.d_model = d_model
        self.d_k = d_model // heads
        self.h = heads
        self.q_linear = nn.Linear(d_model, d_model)
        self.v_linear = nn.Linear(d_model, d_model)
        self.k_linear = nn.Linear(d_model, d_model)
        self.dropout = nn.Dropout(dropout)
        self.out = nn.Linear(d_model, d_model)

    def forward(self, q, k, v, mask=None):
        bs = q.size(0)
        # project, split into heads, then move heads before sequence:
        # (bs, seq, d_model) -> (bs, h, seq, d_k)
        k = self.k_linear(k).view(bs, -1, self.h, self.d_k).transpose(1, 2)
        q = self.q_linear(q).view(bs, -1, self.h, self.d_k).transpose(1, 2)
        v = self.v_linear(v).view(bs, -1, self.h, self.d_k).transpose(1, 2)
        scores = attention(q, k, v, self.d_k, mask, self.dropout)
        # merge the heads back and apply the final linear layer
        concat = scores.transpose(1, 2).contiguous().view(bs, -1, self.d_model)
        return self.out(concat)


class FeedForward(nn.Module):
    """Position-wise feed-forward network (ReLU, default hidden size 2048)."""

    def __init__(self, d_model, d_ff=2048, dropout=0.1):
        super().__init__()
        self.linear_1 = nn.Linear(d_model, d_ff)
        self.dropout = nn.Dropout(dropout)
        self.linear_2 = nn.Linear(d_ff, d_model)

    def forward(self, x):
        hidden = self.dropout(F.relu(self.linear_1(x)))
        return self.linear_2(hidden)


class EncoderLayer(nn.Module):
    """Pre-norm encoder layer: self-attention then feed-forward, with residuals."""

    def __init__(self, d_model, heads, dropout=0.1):
        super().__init__()
        self.norm_1 = Norm(d_model)
        self.norm_2 = Norm(d_model)
        self.attn = MultiHeadAttention(heads, d_model, dropout=dropout)
        self.ff = FeedForward(d_model, dropout=dropout)
        self.dropout_1 = nn.Dropout(dropout)
        self.dropout_2 = nn.Dropout(dropout)

    def forward(self, x, mask):
        normed = self.norm_1(x)
        x = x + self.dropout_1(self.attn(normed, normed, normed, mask))
        normed = self.norm_2(x)
        return x + self.dropout_2(self.ff(normed))


class DecoderLayer(nn.Module):
    """Pre-norm decoder layer: masked self-attention, cross-attention, feed-forward."""

    def __init__(self, d_model, heads, dropout=0.1):
        super().__init__()
        self.norm_1 = Norm(d_model)
        self.norm_2 = Norm(d_model)
        self.norm_3 = Norm(d_model)
        self.dropout_1 = nn.Dropout(dropout)
        self.dropout_2 = nn.Dropout(dropout)
        self.dropout_3 = nn.Dropout(dropout)
        self.attn_1 = MultiHeadAttention(heads, d_model, dropout=dropout)
        self.attn_2 = MultiHeadAttention(heads, d_model, dropout=dropout)
        self.ff = FeedForward(d_model, dropout=dropout)

    def forward(self, x, e_outputs, src_mask, trg_mask):
        normed = self.norm_1(x)
        x = x + self.dropout_1(self.attn_1(normed, normed, normed, trg_mask))
        normed = self.norm_2(x)
        x = x + self.dropout_2(self.attn_2(normed, e_outputs, e_outputs, src_mask))
        normed = self.norm_3(x)
        return x + self.dropout_3(self.ff(normed))
nn.Embedding(vocab_size, d_model) + def forward(self, x): + return self.embed(x) + +class PositionalEncoder(nn.Module): + def __init__(self, d_model, max_seq_len = 200, dropout = 0.1): + super().__init__() + self.d_model = d_model + self.dropout = nn.Dropout(dropout) + # create constant 'pe' matrix with values dependant on + # pos and i + pe = torch.zeros(max_seq_len, d_model) + for pos in range(max_seq_len): + for i in range(0, d_model, 2): + pe[pos, i] = \ + math.sin(pos / (10000 ** ((2 * i)/d_model))) + pe[pos, i + 1] = \ + math.cos(pos / (10000 ** ((2 * (i + 1))/d_model))) + pe = pe.unsqueeze(0) + self.register_buffer('pe', pe) + + + def forward(self, x): + # make embeddings relatively larger + x = x * math.sqrt(self.d_model) + #add constant to embedding + seq_len = x.size(1) + pe = Variable(self.pe[:,:seq_len], requires_grad=False) + if x.is_cuda: + pe.cuda() + x = x + pe + return self.dropout(x) + +def get_clones(module, N): + return nn.ModuleList([copy.deepcopy(module) for i in range(N)]) + +class Encoder(nn.Module): + def __init__(self, vocab_size, d_model, N, heads, dropout): + super().__init__() + self.N = N + self.embed = Embedder(vocab_size, d_model) + self.pe = PositionalEncoder(d_model, dropout=dropout) + self.layers = get_clones(EncoderLayer(d_model, heads, dropout), N) + self.norm = Norm(d_model) + def forward(self, src, mask): + x = self.embed(src) + x = self.pe(x) + for i in range(self.N): + x = self.layers[i](x, mask) + return self.norm(x) + +class Decoder(nn.Module): + def __init__(self, vocab_size, d_model, N, heads, dropout): + super().__init__() + self.N = N + self.embed = Embedder(vocab_size, d_model) + self.pe = PositionalEncoder(d_model, dropout=dropout) + self.layers = get_clones(DecoderLayer(d_model, heads, dropout), N) + self.norm = Norm(d_model) + def forward(self, trg, e_outputs, src_mask, trg_mask): + x = self.embed(trg) + x = self.pe(x) + for i in range(self.N): + x = self.layers[i](x, e_outputs, src_mask, trg_mask) + return 
self.norm(x) + +class Model(nn.Module): + def __init__(self, config): + super().__init__() + self.encoder = Encoder(len(config["vocab_encoder"]), config[varables.DIM_ATTENTION], config[varables.NUM_LAYERS], config[varables.NUM_HEADS], config[varables.RATE_DROPOUT]) + self.decoder = Decoder(len(config["vocab_decoder"]), config[varables.DIM_ATTENTION], config[varables.NUM_LAYERS], config[varables.NUM_HEADS], config[varables.RATE_DROPOUT]) + self.out = nn.Linear(config[varables.DIM_ATTENTION], len(config["vocab_decoder"])) + # self.tok_emb = nn.Embedding(config[varables.SIZE_VOCAB], config[varables.DIM_EMBEDDING]) + # self.pos_emb = nn.Parameter(torch.zeros(1, config[varables.SIZE_BLOCK], config[varables.DIM_EMBEDDING])) + # self.drop = nn.Dropout(config[varables.RATE_DROPOUT]) + # self.encoder_blocks = nn.ModuleList([EncoderBlock(config) for _ in range(config[varables.NUM_LAYERS])]) + # self.decoder_blocks = nn.ModuleList([DecoderBlock(config) for _ in range(config[varables.NUM_LAYERS])]) + # self.blocks = nn.Sequential(*[DecoderBlock(config) for _ in range(config[varables.NUM_LAYERS])]) + # self.ln_f = nn.LayerNorm(config[varables.DIM_EMBEDDING]) + # self.head = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.SIZE_VOCAB], bias=False) + # self.block_size = config[varables.SIZE_BLOCK] + # self.apply(self._init_weights) + # logger.info("number of parameters: %e", sum(p.numel() for p in self.parameters())) + self.optimizer = None + + def get_block_size(self): + return self.block_size + + def _init_weights(self, module): + if isinstance(module, (nn.Linear, nn.Embedding)): + module.weight.data.normal_(mean=0.0, std=0.02) + if isinstance(module, nn.Linear) and module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.LayerNorm): + module.bias.data.zero_() + module.weight.data.fill_(1.0) + def init_optimizers(self,train_config): + optimizer = torch.optim.Adam(self.parameters(), lr=train_config[varables.RATE_LEARNING]) + return optimizer + 
    def init_scheduler(self,train_config):
        """Step-decay LR scheduler.

        NOTE(review): reads self.optimizer, which __init__ leaves as None;
        the caller must assign model.optimizer before calling this.
        """
        scheduler = torch.optim.lr_scheduler.StepLR(self.optimizer, step_size=train_config[varables.SIZE_STEP], gamma=train_config[varables.GAMMA])
        return scheduler
    def get_collate_fn(self, vocab_encoder,vocab_decoder):
        """Return a DataLoader collate_fn that right-pads encoder and decoder
        token-id lists to the per-batch maximum length with TOKEN_PAD."""
        def collate(results):
            x_in = [a[0] for a in results]
            y_in = [a[1] for a in results]
            boundary = -1  # placeholder kept for the uniform (x, y, boundary) interface
            max_len_x = max([len(a) for a in x_in])
            max_len_y = max([len(a) for a in y_in])
            x = torch.tensor([(a+[vocab_encoder[varables.TOKEN_PAD]]*(max_len_x-len(a))) for a in x_in],dtype=torch.long)
            y = torch.tensor([(a+[vocab_decoder[varables.TOKEN_PAD]]*(max_len_y-len(a))) for a in y_in],dtype=torch.long)
            return x,y,boundary
        return collate
    def forward(self, src, trg, trg_out, boundary=None):
        """Teacher-forced forward pass; returns (logits, loss).

        loss is None when trg_out is None (inference mode).
        """
        src_mask = None  # no source padding mask is applied here
        # Causal (lower-triangular) mask over target positions.
        # NOTE(review): attention() unsqueezes its mask once more, so this
        # already-4-D mask broadcasts the score tensor to 5-D — verify the
        # batch>1 behaviour is as intended.
        trg_mask = torch.tril(torch.ones(trg.shape[1], trg.shape[1])).view(1, 1, trg.shape[1], trg.shape[1]).to(trg.device)
        e_outputs = self.encoder(src, src_mask)
        d_output = self.decoder(trg, e_outputs, src_mask, trg_mask)
        logits = self.out(d_output)
        loss = None
        if trg_out is not None:
            loss = F.cross_entropy(logits.view(-1, logits.size(-1)), trg_out.view(-1))
        return logits, loss

# mark test

# ---------------------------------------------------------------------------
# SCMG/models/Transformer_debug4/model copy.py — new file in this diff.
# NOTE(review): this appears to be a superseded backup of model.py; its code
# is kept byte-identical below, with review comments only.
# ---------------------------------------------------------------------------

import math
import logging

import torch
import torch.nn as nn
from torch.nn import functional as F

logger = logging.getLogger(__name__)
from SCMG.config import varables

# class ModelConfig():
#     rate_dropout_embedding = 0.1
#     rate_dropout_residue = 0.1
#     rate_dropout_attention = 0.1
#     block_size=125
#     def __init__(self, size_vocab, **kwargs):
#         self.size_vocab = size_vocab
#         for k,v in kwargs.items():
#             setattr(self, k, v)

class CausalSelfAttention(nn.Module):
    """Multi-head self-attention with a fixed lower-triangular (causal) mask
    buffer of size SIZE_BLOCK x SIZE_BLOCK."""
    def __init__(self, config):
        super().__init__()
        assert config[varables.DIM_ATTENTION] % config[varables.NUM_HEADS] == 0
        # Fused per-head projections.
        self.key = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION])
        self.query = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION])
        self.value = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION])
        self.dropout_attention = nn.Dropout(config[varables.RATE_DROPOUT])
        self.dropout_residue = nn.Dropout(config[varables.RATE_DROPOUT])
        self.projection = nn.Linear(config[varables.DIM_ATTENTION], config[varables.DIM_EMBEDDING])
        # Constant causal mask, registered so it follows the module's device.
        self.register_buffer("mask", torch.tril(torch.ones(config[varables.SIZE_BLOCK], config[varables.SIZE_BLOCK]))
                             .view(1, 1, config[varables.SIZE_BLOCK], config[varables.SIZE_BLOCK]))
        self.n_head = config[varables.NUM_HEADS]
        self.single_head_dim = config[varables.DIM_ATTENTION] // self.n_head
        self.attention_features = config[varables.DIM_ATTENTION]

    def forward(self, x, layer_past=None):
        # x: (B, T, C) with C == DIM_EMBEDDING.
        B, T, C = x.size()
        k = self.key(x).view(B, T, self.n_head,self.single_head_dim).transpose(1, 2)
        q = self.query(x).view(B, T, self.n_head,self.single_head_dim).transpose(1, 2)
        v = self.value(x).view(B, T, self.n_head,self.single_head_dim).transpose(1, 2)
        att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1)))
        # Hide future positions before the softmax.
        att = att.masked_fill(self.mask[:,:,:T,:T] == 0, float('-inf'))
        att = F.softmax(att, dim=-1)
        att = self.dropout_attention(att)
        y = att @ v
        y = y.transpose(1, 2).contiguous().view(B, T, self.attention_features)
        y = self.dropout_residue(self.projection(y))
        return y


class CrossAttention(nn.Module):
    """Attention with decoder queries over encoder keys/values.

    NOTE(review): a causal (lower-triangular) mask is applied across ENCODER
    positions, i.e. decoder step t cannot attend encoder positions > t.
    Standard cross-attention is unmasked — confirm this is intentional.
    """
    def __init__(self, config):
        super().__init__()
        assert config[varables.DIM_ATTENTION] % config[varables.NUM_HEADS] == 0
        self.key = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION])
        self.query = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION])
        self.value = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION])
        self.dropout_attention = nn.Dropout(config[varables.RATE_DROPOUT])
        self.dropout_residue = nn.Dropout(config[varables.RATE_DROPOUT])
        self.projection = nn.Linear(config[varables.DIM_ATTENTION], config[varables.DIM_EMBEDDING])
        self.n_head = config[varables.NUM_HEADS]
        self.single_head_dim = config[varables.DIM_ATTENTION] // self.n_head
        self.attention_features = config[varables.DIM_ATTENTION]
        self.register_buffer("mask", torch.tril(torch.ones(config[varables.SIZE_BLOCK], config[varables.SIZE_BLOCK]))
                             .view(1, 1, config[varables.SIZE_BLOCK], config[varables.SIZE_BLOCK]))

    def forward(self, x_encoder,x_decoder, layer_past=None):
        B_encoder, T_encoder, C_encoder = x_encoder.size()
        B_decoder, T_decoder, C_decoder = x_decoder.size()
        # Keys/values from the encoder stream, queries from the decoder stream.
        k = self.key( x_encoder).view(B_encoder, T_encoder, self.n_head,self.single_head_dim).transpose(1, 2)
        q = self.query(x_decoder).view(B_encoder, T_decoder, self.n_head,self.single_head_dim).transpose(1, 2)
        v = self.value(x_encoder).view(B_encoder, T_encoder, self.n_head,self.single_head_dim).transpose(1, 2)
        att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1)))
        att = att.masked_fill(self.mask[:,:,:T_decoder,:T_encoder] == 0, float('-inf'))
        att = F.softmax(att, dim=-1)
        att = self.dropout_attention(att)
        y = att @ v
        y = y.transpose(1, 2).contiguous().view(B_encoder, T_decoder, self.attention_features)
        y = self.dropout_residue(self.projection(y))
        return y


class EncoderBlock(nn.Module):
    """Pre-norm encoder block: (causal!) self-attention + GELU MLP.

    NOTE(review): the encoder uses CausalSelfAttention, so encoder tokens
    cannot see later source tokens — confirm this is intentional.
    """
    def __init__(self, config):
        super().__init__()
        self.ln1 = nn.LayerNorm(config[varables.DIM_EMBEDDING])
        self.ln2 = nn.LayerNorm(config[varables.DIM_EMBEDDING])
        self.attn = CausalSelfAttention(config)
        self.mlp = nn.Sequential(
            nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_FEEDFORWARD]),
            nn.GELU(),
            nn.Linear(config[varables.DIM_FEEDFORWARD], config[varables.DIM_EMBEDDING]),
            nn.Dropout(config[varables.RATE_DROPOUT]),
        )

    def forward(self, x):
        x = x + self.attn(self.ln1(x))
        x = x + self.mlp(self.ln2(x))
        return x

class DecoderBlock(nn.Module):
    """Pre-norm decoder block: masked self-attention, cross-attention, MLP.

    NOTE(review): ln1's output feeds BOTH the masked self-attention and
    (recomputed) the cross-attention input; there is no dedicated third
    LayerNorm — confirm this is intentional.
    """
    def __init__(self, config):
        super().__init__()
        self.ln1 = nn.LayerNorm(config[varables.DIM_EMBEDDING])
        self.ln2 = nn.LayerNorm(config[varables.DIM_EMBEDDING])
        self.masked_attn = CausalSelfAttention(config)
        self.cross_attn = CrossAttention(config)
        self.mlp = nn.Sequential(
            nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_FEEDFORWARD]),
            nn.GELU(),
            nn.Linear(config[varables.DIM_FEEDFORWARD], config[varables.DIM_EMBEDDING]),
            nn.Dropout(config[varables.RATE_DROPOUT]),
        )

    def forward(self, x_encoder,x):
        x = x + self.masked_attn(self.ln1(x))
        x = x + self.cross_attn(x_encoder,self.ln1(x))
        x = x + self.mlp(self.ln2(x))
        return x

class Model(nn.Module):
    """GPT-style encoder-decoder assembled from the pre-norm blocks above.

    Uses one shared token embedding and one learned positional embedding
    table for both streams.
    """
    def __init__(self, config):
        super().__init__()
        self.tok_emb = nn.Embedding(config[varables.SIZE_VOCAB], config[varables.DIM_EMBEDDING])
        self.pos_emb = nn.Parameter(torch.zeros(1, config[varables.SIZE_BLOCK], config[varables.DIM_EMBEDDING]))
        self.drop = nn.Dropout(config[varables.RATE_DROPOUT])
        self.encoder_blocks = nn.ModuleList([EncoderBlock(config) for _ in range(config[varables.NUM_LAYERS])])
        self.decoder_blocks = nn.ModuleList([DecoderBlock(config) for _ in range(config[varables.NUM_LAYERS])])
        # self.blocks = nn.Sequential(*[DecoderBlock(config) for _ in range(config[varables.NUM_LAYERS])])
        self.ln_f = nn.LayerNorm(config[varables.DIM_EMBEDDING])
        self.head = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.SIZE_VOCAB], bias=False)
        self.block_size = config[varables.SIZE_BLOCK]
        self.apply(self._init_weights)
        logger.info("number of parameters: %e", sum(p.numel() for p in self.parameters()))
        # Assigned externally by the training driver before init_scheduler.
        self.optimizer = None

    def get_block_size(self):
        # Maximum sequence length supported by pos_emb.
        return self.block_size

    def _init_weights(self, module):
        # GPT-style init: N(0, 0.02) weights, zero biases, identity LayerNorm.
        if isinstance(module, (nn.Linear, nn.Embedding)):
            module.weight.data.normal_(mean=0.0, std=0.02)
            if isinstance(module, nn.Linear) and module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
    def init_optimizers(self,train_config):
        """Create an Adam optimizer over all parameters (not stored on self)."""
        optimizer = torch.optim.Adam(self.parameters(), lr=train_config[varables.RATE_LEARNING])
        return optimizer
    def init_scheduler(self,train_config):
        """Step-decay LR scheduler; requires self.optimizer to be set first."""
        scheduler = torch.optim.lr_scheduler.StepLR(self.optimizer, step_size=train_config[varables.SIZE_STEP], gamma=train_config[varables.GAMMA])
        return scheduler
    def get_collate_fn(self, vocab):
        """Collate_fn that right-pads both sequences with TOKEN_PAD (single
        shared vocab here, unlike the two-vocab variant in model.py)."""
        def collate(results):
            x_in = [a[0] for a in results]
            y_in = [a[1] for a in results]
            boundary = -1
            max_len_x = max([len(a) for a in x_in])
            max_len_y = max([len(a) for a in y_in])
            x = torch.tensor([(a+[vocab[varables.TOKEN_PAD]]*(max_len_x-len(a))) for a in x_in],dtype=torch.long)
            y = torch.tensor([(a+[vocab[varables.TOKEN_PAD]]*(max_len_y-len(a))) for a in y_in],dtype=torch.long)
            return x,y,boundary
        return collate

    def forward(self, x_in, y_in, y_out=None,boundary=None):
        """Teacher-forced forward pass; returns (logits, loss)."""
        # Token + learned positional embeddings for both streams.
        x_in = self.drop(self.tok_emb(x_in) + self.pos_emb[:, :x_in.size()[1], :])
        y_in = self.drop(self.tok_emb(y_in) + self.pos_emb[:, :y_in.size()[1], :])
        for encoder_block in self.encoder_blocks:
            x_in = encoder_block(x_in)
        x_in = self.ln_f(x_in)  # NOTE(review): ln_f is shared by both streams
        for decoder_block in self.decoder_blocks:
            y_in = decoder_block(x_in,y_in)
        y_in = self.ln_f(y_in)
        logits = self.head(y_in)
        loss = None
        if y_out is not None:
            loss = F.cross_entropy(logits.view(-1, logits.size(-1)), y_out.view(-1))
        return logits, loss

# mark test
import math
import logging

import torch
import torch.nn as nn
from torch.nn import functional as F

# logger = logging.getLogger(__name__)
from SCMG.config import varables
from torch.autograd import Variable  # NOTE(review): unused; kept as a file-level import


class PositionalEncoder(nn.Module):
    """Sinusoidal positional encoding precomputed up to SIZE_BLOCK positions."""

    def __init__(self, config):
        super(PositionalEncoder, self).__init__()
        self.Dropout = nn.Dropout(p=config[varables.RATE_DROPOUT])
        max_len = config[varables.SIZE_BLOCK]
        pe = torch.zeros(max_len, config[varables.DIM_EMBEDDING])
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        # Geometric frequency progression: 10000^(-2i / d_model).
        div_term = torch.exp(torch.arange(0, config[varables.DIM_EMBEDDING], 2).float() * (-math.log(10000.0) / config[varables.DIM_EMBEDDING]))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0)
        # Buffer (not a parameter): moves with .to(device), no gradients.
        self.register_buffer('pe', pe)

    def forward(self, T):
        """Return dropout(pe[:, :T]); note dropout hits the table itself."""
        x = self.Dropout(self.pe[:, :T, :])
        return x


class Attention(nn.Module):
    """Unified multi-head attention used for both self- and cross-attention.

    forward(X_1, X_2, mask): keys/values come from X_1, queries from X_2;
    pass X_2=None for self-attention (queries also drawn from X_1).
    """

    def __init__(self, config):
        super().__init__()
        assert config[varables.DIM_ATTENTION] % config[varables.NUM_HEADS] == 0
        self.Key = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION])
        self.Query = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION])
        self.Value = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION])
        self.Dropout_Attention = nn.Dropout(config[varables.RATE_DROPOUT])
        self.Dropout_Residue = nn.Dropout(config[varables.RATE_DROPOUT])
        self.Projection = nn.Linear(config[varables.DIM_ATTENTION], config[varables.DIM_EMBEDDING])
        self.NumberOfHeads = config[varables.NUM_HEADS]
        self.DimHead = config[varables.DIM_ATTENTION] // self.NumberOfHeads
        self.DimAttention = config[varables.DIM_ATTENTION]

    def forward(self, X_1, X_2, mask=None):
        if X_2 is None:
            X_2 = X_1
        BatchSize, T_Encoder, _ = X_1.size()
        BatchSize, T_Decoder, _ = X_2.size()
        # Split heads: (batch, heads, seq, head_dim).
        K = self.Key(X_1).view(BatchSize, T_Encoder, self.NumberOfHeads, self.DimHead).transpose(1, 2)
        Q = self.Query(X_2).view(BatchSize, T_Decoder, self.NumberOfHeads, self.DimHead).transpose(1, 2)
        V = self.Value(X_1).view(BatchSize, T_Encoder, self.NumberOfHeads, self.DimHead).transpose(1, 2)
        ScoreAttention = (Q @ K.transpose(-2, -1)) / math.sqrt(self.DimHead)
        # BUG FIX: the declared default mask=None previously crashed on
        # `mask == 0` (TypeError); only mask when a mask is supplied.
        if mask is not None:
            ScoreAttention = ScoreAttention.masked_fill(mask == 0, -1e9)
        ScoreAttention = F.softmax(ScoreAttention, dim=-1)
        ScoreAttention = self.Dropout_Attention(ScoreAttention)
        Z = ScoreAttention @ V
        # Merge heads back: (batch, seq, DimAttention).
        Z = Z.transpose(1, 2).contiguous().view(BatchSize, T_Decoder, self.DimAttention)
        Z = self.Dropout_Residue(self.Projection(Z))
        return Z


class FeedForward(nn.Module):
    """Position-wise MLP; DIM_FEEDFORWARD == 0 means "use 4 x DIM_ATTENTION"."""

    def __init__(self, config):
        super().__init__()
        if config[varables.DIM_FEEDFORWARD] == 0:
            Dim_FeedForward = config[varables.DIM_ATTENTION] * 4
        else:
            Dim_FeedForward = config[varables.DIM_FEEDFORWARD]
        self.Linear1 = nn.Linear(config[varables.DIM_EMBEDDING], Dim_FeedForward)
        self.GELU = nn.GELU()
        self.Linear2 = nn.Linear(Dim_FeedForward, config[varables.DIM_EMBEDDING])
        self.Dropout = nn.Dropout(config[varables.RATE_DROPOUT])

    def forward(self, x):
        x = self.Linear1(x)
        x = self.GELU(x)
        x = self.Dropout(x)
        x = self.Linear2(x)
        return x


class EncoderBlock(nn.Module):
    """Post-norm encoder block.

    NOTE(review): dropout is applied to the *input* of each sub-layer
    (Dropout1/Dropout2) rather than to its output — unusual; kept as-is.
    """

    def __init__(self, config):
        super().__init__()
        self.LayerNorm1 = nn.LayerNorm(config[varables.DIM_EMBEDDING])
        self.LayerNorm2 = nn.LayerNorm(config[varables.DIM_EMBEDDING])
        self.Dropout1 = nn.Dropout(config[varables.RATE_DROPOUT])
        self.Dropout2 = nn.Dropout(config[varables.RATE_DROPOUT])
        self.Attention = Attention(config)
        self.FeedForward = FeedForward(config)

    def forward(self, X_Encoder, Mask_Encoder):
        X_Encoder = self.LayerNorm1(X_Encoder + self.Attention(self.Dropout1(X_Encoder), None, Mask_Encoder))
        X_Encoder = self.LayerNorm2(X_Encoder + self.FeedForward(self.Dropout2(X_Encoder)))
        return X_Encoder


class DecoderBlock(nn.Module):
    """Post-norm decoder block: masked self-attn, cross-attn, feed-forward."""

    def __init__(self, config):
        super().__init__()
        self.LayerNorm1 = nn.LayerNorm(config[varables.DIM_EMBEDDING])
        self.LayerNorm2 = nn.LayerNorm(config[varables.DIM_EMBEDDING])
        self.LayerNorm3 = nn.LayerNorm(config[varables.DIM_EMBEDDING])
        self.Dropout1 = nn.Dropout(config[varables.RATE_DROPOUT])
        self.Dropout2 = nn.Dropout(config[varables.RATE_DROPOUT])
        self.Dropout3 = nn.Dropout(config[varables.RATE_DROPOUT])
        self.AttentionMasked = Attention(config)
        self.AttentionCross = Attention(config)
        self.FeedForward = FeedForward(config)

    def forward(self, X_Encoder, X_Decoder, Mask_Cross, Mask_Decoder):
        X_Decoder = self.LayerNorm1(X_Decoder + self.AttentionMasked(self.Dropout1(X_Decoder), None, Mask_Decoder))
        X_Decoder = self.LayerNorm2(X_Decoder + self.AttentionCross(X_Encoder, self.Dropout2(X_Decoder), Mask_Cross))
        X_Decoder = self.LayerNorm3(X_Decoder + self.FeedForward(self.Dropout3(X_Decoder)))
        return X_Decoder


class Model(nn.Module):
    """Encoder-decoder transformer with padding-aware masks and Vaswani-style
    sqrt(d)-scaled embeddings; separate encoder and decoder vocabularies."""

    def __init__(self, config):
        super().__init__()
        # Variables
        self.Dim_Embedding = config[varables.DIM_EMBEDDING]
        self.Token_Padding_Encoder = config["Token_Padding_Encoder"]
        self.Token_Padding_Decoder = config["Token_Padding_Decoder"]
        # Embedding and positional encoding layers
        self.Embedding_Encoder = nn.Embedding(len(config["vocab_encoder"]), config[varables.DIM_EMBEDDING])
        self.Embedding_Decoder = nn.Embedding(len(config["vocab_decoder"]), config[varables.DIM_EMBEDDING])
        self.pos_emb = PositionalEncoder(config)
        # Dropout and normalization layers (LayerNorm1/2 are currently unused
        # in forward — the final-norm calls there are commented out).
        self.Dropout1 = nn.Dropout(config[varables.RATE_DROPOUT])
        self.Dropout2 = nn.Dropout(config[varables.RATE_DROPOUT])
        self.LayerNorm1 = nn.LayerNorm(config[varables.DIM_EMBEDDING])
        self.LayerNorm2 = nn.LayerNorm(config[varables.DIM_EMBEDDING])
        # Transformer layers
        self.encoder_blocks = nn.ModuleList([EncoderBlock(config) for _ in range(config[varables.NUM_LAYERS])])
        self.decoder_blocks = nn.ModuleList([DecoderBlock(config) for _ in range(config[varables.NUM_LAYERS])])
        # Output layer
        self.head = nn.Linear(config[varables.DIM_EMBEDDING], len(config["vocab_decoder"]), bias=False)
        # Init
        self.apply(self._init_weights)
        self.optimizer = None
        # logger.info("number of parameters: %e", sum(p.numel() for p in self.parameters()))

    def _init_weights(self, module):
        # Xavier-uniform for weight matrices; 1-D params keep their defaults.
        # NOTE(review): self.apply calls this for every submodule and
        # parameters() is recursive, so nested weights are re-initialised
        # several times; the final result is the same, just wasteful.
        for p in module.parameters():
            if p.dim() > 1:
                nn.init.xavier_uniform_(p)

    def init_optimizers(self, train_config):
        """Create an Adam optimizer over all parameters (not stored on self)."""
        optimizer = torch.optim.Adam(self.parameters(), lr=train_config[varables.RATE_LEARNING])
        return optimizer

    def init_scheduler(self, train_config):
        """Step-decay LR scheduler; requires self.optimizer to be set first."""
        scheduler = torch.optim.lr_scheduler.StepLR(self.optimizer, step_size=train_config[varables.SIZE_STEP], gamma=train_config[varables.GAMMA])
        return scheduler

    def get_collate_fn(self, vocab_encoder, vocab_decoder):
        """Collate_fn that right-pads encoder/decoder id lists with TOKEN_PAD."""
        def collate(results):
            X_Encoder = [a[0] for a in results]
            X_Decoder = [a[1] for a in results]
            boundary = -1  # placeholder kept for the uniform (x, y, boundary) interface
            max_len_x = max([len(a) for a in X_Encoder])
            max_len_y = max([len(a) for a in X_Decoder])
            x = torch.tensor([(a + [vocab_encoder[varables.TOKEN_PAD]] * (max_len_x - len(a))) for a in X_Encoder], dtype=torch.long)
            y = torch.tensor([(a + [vocab_decoder[varables.TOKEN_PAD]] * (max_len_y - len(a))) for a in X_Decoder], dtype=torch.long)
            return x, y, boundary
        return collate

    def generate_masks(self, X_Encoder, X_Decoder):
        """Build encoder padding, causal decoder, and cross padding masks."""
        T = X_Decoder.shape[1]
        Mask_Encoder = (X_Encoder != self.Token_Padding_Encoder).unsqueeze(-2).unsqueeze(-2)
        Mask_Decoder = (X_Decoder != self.Token_Padding_Decoder).unsqueeze(-2).unsqueeze(-2).repeat(1, 1, T, 1)
        Mask_Cross = (X_Encoder != self.Token_Padding_Encoder).unsqueeze(-2).unsqueeze(-2)
        # Combine padding mask with the lower-triangular causal mask.
        mask_tril = torch.tril(torch.ones(T, T)).view(1, 1, T, T).to(Mask_Decoder.device)
        Mask_Decoder = Mask_Decoder.masked_fill(mask_tril == 0, 0)
        return Mask_Encoder, Mask_Decoder, Mask_Cross

    def forward(self, X_Encoder, X_Decoder, Y_Decoder_Ref=None, boundary=None):
        """Teacher-forced forward pass; returns (logits, loss).

        loss (cross-entropy, padding ignored) is None when Y_Decoder_Ref is None.
        """
        Mask_Encoder, Mask_Decoder, Mask_Cross = self.generate_masks(X_Encoder, X_Decoder)
        # Embed + scale by sqrt(d) + add positional encoding.
        X_Encoder = self.Dropout1(self.Embedding_Encoder(X_Encoder) * math.sqrt(self.Dim_Embedding) + self.pos_emb(X_Encoder.size(1)))
        X_Decoder = self.Dropout2(self.Embedding_Decoder(X_Decoder) * math.sqrt(self.Dim_Embedding) + self.pos_emb(X_Decoder.size(1)))
        # Encoder blocks
        for encoder_block in self.encoder_blocks:
            X_Encoder = encoder_block(X_Encoder, Mask_Encoder)
        # X_Encoder = self.LayerNorm1(X_Encoder)
        # Decoder blocks
        for decoder_block in self.decoder_blocks:
            X_Decoder = decoder_block(X_Encoder, X_Decoder, Mask_Cross, Mask_Decoder)
        # X_Decoder = self.LayerNorm2(X_Decoder)
        Y_Decoder_Logits = self.head(X_Decoder)
        loss = None
        if Y_Decoder_Ref is not None:
            loss = F.cross_entropy(Y_Decoder_Logits.view(-1, Y_Decoder_Logits.size(-1)), Y_Decoder_Ref.view(-1), ignore_index=self.Token_Padding_Decoder)
        return Y_Decoder_Logits, loss

    # Legacy commented-out generate_masks variant kept for reference:
    # def generate_masks(self,X_Encoder, X_Decoder):
    #     Mask_Encoder = (X_Encoder != self.Token_Padding_Encoder).unsqueeze(-2).int().cpu()
    #     Mask_Decoder = (X_Decoder != self.Token_Padding_Decoder).unsqueeze(-2).int().cpu()
    #     Mask_Cross = Mask_Decoder.unsqueeze(-1) @ Mask_Encoder.unsqueeze(-2)
    #     Mask_Encoder = Mask_Encoder.unsqueeze(-1) @ Mask_Encoder.unsqueeze(-2)
    #     Mask_Decoder = Mask_Decoder.unsqueeze(-1) @
Mask_Decoder.unsqueeze(-2) + # T = X_Decoder.shape[1] + # mask_tril = torch.tril(torch.ones(T, T)).view(1, 1, T, T) + # Mask_Decoder = Mask_Decoder.masked_fill(mask_tril==0,0) + # Mask_Encoder = Mask_Encoder.to(X_Encoder.device) + # Mask_Decoder = Mask_Decoder.to(X_Decoder.device) + # Mask_Cross = Mask_Cross.to(X_Encoder.device) + # return Mask_Encoder,Mask_Decoder,Mask_Cross diff --git a/SCMG/models/Transformer_debug4/sampler.py b/SCMG/models/Transformer_debug4/sampler.py new file mode 100644 index 0000000000000000000000000000000000000000..1c606302447534d425b50bbd15c153ea79895b65 --- /dev/null +++ b/SCMG/models/Transformer_debug4/sampler.py @@ -0,0 +1,85 @@ +import random +import numpy as np +import torch +import torch.nn as nn +from torch.nn import functional as F + +def set_seed(seed): + random.seed(seed) + np.random.seed(seed) + torch.manual_seed(seed) + torch.cuda.manual_seed_all(seed) + +def top_k_logits(logits, k): + v, ix = torch.topk(logits, k) + out = logits.clone() + out[out < v[:, [-1]]] = -float('Inf') + return out + +@torch.no_grad() +def sample(model, x, steps, temperature=1.0, sample=False, top_k=None): + block_size = model.get_block_size() + model.eval() + for k in range(steps): + x_cond = x if x.size(1) <= block_size else x[:, -block_size:] + logits, _ = model(x_cond) + logits = logits[:, -1, :] / temperature + if top_k is not None: + logits = top_k_logits(logits, top_k) + probs = F.softmax(logits, dim=-1) + if sample: + ix = torch.multinomial(probs, num_samples=1) + else: + _, ix = torch.topk(probs, k=1, dim=-1) + x = torch.cat((x, ix), dim=1) + + return x + + + + +@torch.no_grad() +def sample(model, x, steps, temperature=1.0,boundary=None): + block_size = model.get_block_size() + model.eval() + for k in range(steps): + x_cond = x if x.size(1) <= block_size else x[:, -block_size:] + logits, _ = model(x_cond,boundary=boundary) + logits = logits[:, -1, :] / temperature + probs = F.softmax(logits, dim=-1) + ix = torch.multinomial(probs, 
num_samples=1) + x = torch.cat((x, ix), dim=1) + return x + +'L_5*C(=O)NCc1cccc(OC)c1.*c1nsc2ccccc12COc1cccc(CNC(=O)c2cccc(NC(=O)c3nsc4ccccc34)c2)c1' + +# for i in range(1,21): +def sample_L(i,option='string'): + # i=2 + prefix = 'L_'+str(i) + string_input = prefix + '*O=C1NN=Cc2c1cccc2.*O=C(C1CC1)N1CCNCC1' + array_input = [vocab[a] for a in [''] + list(string_input)] + boundary = [len(array_input)] + tensor_input = torch.tensor(array_input,device='cuda').unsqueeze(0).repeat(32,1) + boundary = boundary*32 + tensor_output = sample(model,tensor_input,250,boundary=boundary) + strings_output = [] + for j in range(tensor_output.shape[0]): + list_string_output = [inv[a] for a in tensor_output[j,boundary[j]:].cpu().numpy() if a != vocab['']] + # if list_string_output[0] == '': + # list_string_output = list_string_output[1:] + if list_string_output[-1] == '': + list_string_output = list_string_output[:-1] + string_output = ''.join(list_string_output) + strings_output.append(string_output) + print(string_output) + for j in range(tensor_output.shape[0]): + if test_valid(strings_output[j]): + print(1) + else: + print(0) + + # logits,_ = model(tensor_input,boundary=boundary) + + +['', 'L', '_', '5', '*', 'C', '(', '=', 'O', ')', 'N', 'C', 'c', '1', 'c', 'c', 'c', 'c', '(', 'O', 'C', ')', 'c', '1', '.', '*', 'c', '1', 'n', 's', 'c', '2', 'c', 'c', 'c', 'c', 'c', '1', '2', 'C', 'O', 'c', '1', 'c', 'c', 'c', 'c', '(', 'C', 'N', 'C', '(', '=', 'O', ')', 'c', '2', 'c', 'c', 'c', 'c', '(', 'N', 'C', '(', '=', 'O', ')', 'c', '3', 'n', 's', 'c', '4', 'c', 'c', 'c', 'c', 'c', '3', '4', ')', 'c', '2', ')', 'c', '1', ''] diff --git a/SCMG/models/Transformer_debug5 copy/__init__.py b/SCMG/models/Transformer_debug5 copy/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/SCMG/models/Transformer_debug5 copy/__pycache__/__init__.cpython-310.pyc b/SCMG/models/Transformer_debug5 
copy/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..67aab376b9f41c82542940cf66ac782c49b5b095 Binary files /dev/null and b/SCMG/models/Transformer_debug5 copy/__pycache__/__init__.cpython-310.pyc differ diff --git a/SCMG/models/Transformer_debug5 copy/__pycache__/model copy 2.cpython-310.pyc b/SCMG/models/Transformer_debug5 copy/__pycache__/model copy 2.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..129871a4e252bdddf64e15662997b5fac4f2bfd0 Binary files /dev/null and b/SCMG/models/Transformer_debug5 copy/__pycache__/model copy 2.cpython-310.pyc differ diff --git a/SCMG/models/Transformer_debug5 copy/__pycache__/model copy.cpython-310.pyc b/SCMG/models/Transformer_debug5 copy/__pycache__/model copy.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f8953b8d78bbd2dc0497858c6c99177a3d942936 Binary files /dev/null and b/SCMG/models/Transformer_debug5 copy/__pycache__/model copy.cpython-310.pyc differ diff --git a/SCMG/models/Transformer_debug5 copy/__pycache__/model.cpython-310.pyc b/SCMG/models/Transformer_debug5 copy/__pycache__/model.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..db32e645e4f11379b26cae6e0841a1539e6cc8aa Binary files /dev/null and b/SCMG/models/Transformer_debug5 copy/__pycache__/model.cpython-310.pyc differ diff --git a/SCMG/models/Transformer_debug5 copy/__pycache__/sampler.cpython-310.pyc b/SCMG/models/Transformer_debug5 copy/__pycache__/sampler.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1ecda0accfc6a66be21620b0fed9e3e41bcc497a Binary files /dev/null and b/SCMG/models/Transformer_debug5 copy/__pycache__/sampler.cpython-310.pyc differ diff --git a/SCMG/models/Transformer_debug5 copy/model copy 2.py b/SCMG/models/Transformer_debug5 copy/model copy 2.py new file mode 100644 index 
# ---------------------------------------------------------------------------
# SCMG/models/Transformer_debug5 copy/model copy 2.py — new file in this diff.
# NOTE(review): this is a backup scratch file; the classes below are exact
# duplicates of those in Transformer_debug4/model copy.py, and the module is
# truncated mid-import at the end of this chunk. Code kept byte-identical;
# comments only added.
# ---------------------------------------------------------------------------

import math
import logging

import torch
import torch.nn as nn
from torch.nn import functional as F

logger = logging.getLogger(__name__)
from SCMG.config import varables

# class ModelConfig():
#     rate_dropout_embedding = 0.1
#     rate_dropout_residue = 0.1
#     rate_dropout_attention = 0.1
#     block_size=125
#     def __init__(self, size_vocab, **kwargs):
#         self.size_vocab = size_vocab
#         for k,v in kwargs.items():
#             setattr(self, k, v)

class CausalSelfAttention(nn.Module):
    """Multi-head self-attention with a fixed lower-triangular (causal) mask."""
    def __init__(self, config):
        super().__init__()
        assert config[varables.DIM_ATTENTION] % config[varables.NUM_HEADS] == 0
        self.key = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION])
        self.query = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION])
        self.value = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION])
        self.dropout_attention = nn.Dropout(config[varables.RATE_DROPOUT])
        self.dropout_residue = nn.Dropout(config[varables.RATE_DROPOUT])
        self.projection = nn.Linear(config[varables.DIM_ATTENTION], config[varables.DIM_EMBEDDING])
        # Constant causal mask buffer (follows the module's device).
        self.register_buffer("mask", torch.tril(torch.ones(config[varables.SIZE_BLOCK], config[varables.SIZE_BLOCK]))
                             .view(1, 1, config[varables.SIZE_BLOCK], config[varables.SIZE_BLOCK]))
        self.n_head = config[varables.NUM_HEADS]
        self.single_head_dim = config[varables.DIM_ATTENTION] // self.n_head
        self.attention_features = config[varables.DIM_ATTENTION]

    def forward(self, x, layer_past=None):
        B, T, C = x.size()
        k = self.key(x).view(B, T, self.n_head,self.single_head_dim).transpose(1, 2)
        q = self.query(x).view(B, T, self.n_head,self.single_head_dim).transpose(1, 2)
        v = self.value(x).view(B, T, self.n_head,self.single_head_dim).transpose(1, 2)
        att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1)))
        # Hide future positions before the softmax.
        att = att.masked_fill(self.mask[:,:,:T,:T] == 0, float('-inf'))
        att = F.softmax(att, dim=-1)
        att = self.dropout_attention(att)
        y = att @ v
        y = y.transpose(1, 2).contiguous().view(B, T, self.attention_features)
        y = self.dropout_residue(self.projection(y))
        return y


class CrossAttention(nn.Module):
    """Decoder-query / encoder-key-value attention.

    NOTE(review): a causal tril mask is applied across encoder positions —
    non-standard for cross-attention; confirm intent (same quirk as in
    Transformer_debug4/model copy.py).
    """
    def __init__(self, config):
        super().__init__()
        assert config[varables.DIM_ATTENTION] % config[varables.NUM_HEADS] == 0
        self.key = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION])
        self.query = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION])
        self.value = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION])
        self.dropout_attention = nn.Dropout(config[varables.RATE_DROPOUT])
        self.dropout_residue = nn.Dropout(config[varables.RATE_DROPOUT])
        self.projection = nn.Linear(config[varables.DIM_ATTENTION], config[varables.DIM_EMBEDDING])
        self.n_head = config[varables.NUM_HEADS]
        self.single_head_dim = config[varables.DIM_ATTENTION] // self.n_head
        self.attention_features = config[varables.DIM_ATTENTION]
        self.register_buffer("mask", torch.tril(torch.ones(config[varables.SIZE_BLOCK], config[varables.SIZE_BLOCK]))
                             .view(1, 1, config[varables.SIZE_BLOCK], config[varables.SIZE_BLOCK]))

    def forward(self, x_encoder,x_decoder, layer_past=None):
        B_encoder, T_encoder, C_encoder = x_encoder.size()
        B_decoder, T_decoder, C_decoder = x_decoder.size()
        k = self.key( x_encoder).view(B_encoder, T_encoder, self.n_head,self.single_head_dim).transpose(1, 2)
        q = self.query(x_decoder).view(B_encoder, T_decoder, self.n_head,self.single_head_dim).transpose(1, 2)
        v = self.value(x_encoder).view(B_encoder, T_encoder, self.n_head,self.single_head_dim).transpose(1, 2)
        att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1)))
        att = att.masked_fill(self.mask[:,:,:T_decoder,:T_encoder] == 0, float('-inf'))
        att = F.softmax(att, dim=-1)
        att = self.dropout_attention(att)
        y = att @ v
        y = y.transpose(1, 2).contiguous().view(B_encoder, T_decoder, self.attention_features)
        y = self.dropout_residue(self.projection(y))
        return y


class EncoderBlock(nn.Module):
    """Pre-norm encoder block (uses CausalSelfAttention — see review note on
    the Transformer_debug4 duplicate)."""
    def __init__(self, config):
        super().__init__()
        self.ln1 = nn.LayerNorm(config[varables.DIM_EMBEDDING])
        self.ln2 = nn.LayerNorm(config[varables.DIM_EMBEDDING])
        self.attn = CausalSelfAttention(config)
        self.mlp = nn.Sequential(
            nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_FEEDFORWARD]),
            nn.GELU(),
            nn.Linear(config[varables.DIM_FEEDFORWARD], config[varables.DIM_EMBEDDING]),
            nn.Dropout(config[varables.RATE_DROPOUT]),
        )

    def forward(self, x):
        x = x + self.attn(self.ln1(x))
        x = x + self.mlp(self.ln2(x))
        return x

class DecoderBlock(nn.Module):
    """Pre-norm decoder block; ln1 output feeds both attention sub-layers
    (no dedicated third LayerNorm — see review note on the duplicate)."""
    def __init__(self, config):
        super().__init__()
        self.ln1 = nn.LayerNorm(config[varables.DIM_EMBEDDING])
        self.ln2 = nn.LayerNorm(config[varables.DIM_EMBEDDING])
        self.masked_attn = CausalSelfAttention(config)
        self.cross_attn = CrossAttention(config)
        self.mlp = nn.Sequential(
            nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_FEEDFORWARD]),
            nn.GELU(),
            nn.Linear(config[varables.DIM_FEEDFORWARD], config[varables.DIM_EMBEDDING]),
            nn.Dropout(config[varables.RATE_DROPOUT]),
        )

    def forward(self, x_encoder,x):
        x = x + self.masked_attn(self.ln1(x))
        x = x + self.cross_attn(x_encoder,self.ln1(x))
        x = x + self.mlp(self.ln2(x))
        return x


# Redundant mid-file imports kept as in the original file.
import torch
import torch.nn as nn
import torch.nn.functional as F
import math


class Norm(nn.Module):
    """LayerNorm with learnable gain (`alpha`) and shift (`bias`)."""
    def __init__(self, d_model, eps = 1e-6):
        super().__init__()

        self.size = d_model

        # create two learnable parameters to calibrate normalisation
        self.alpha = nn.Parameter(torch.ones(self.size))
        self.bias = nn.Parameter(torch.zeros(self.size))

        self.eps = eps

    def forward(self, x):
        norm = self.alpha * (x - x.mean(dim=-1, keepdim=True)) \
            / (x.std(dim=-1, keepdim=True) + self.eps) + self.bias
        return norm

def attention(q, k, v, d_k, mask=None, dropout=None):
    """Scaled dot-product attention; mask positions equal to 0 are hidden."""
    scores = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(d_k)

    if mask is not None:
        mask = mask.unsqueeze(1)
        scores = scores.masked_fill(mask == 0, -1e9)

    scores = F.softmax(scores, dim=-1)

    if dropout is not None:
        scores = dropout(scores)

    output = torch.matmul(scores, v)
    return output

class MultiHeadAttention(nn.Module):
    """Multi-head attention with fused Q/K/V projections."""
    def __init__(self, heads, d_model, dropout = 0.1):
        super().__init__()

        self.d_model = d_model
        self.d_k = d_model // heads
        self.h = heads

        self.q_linear = nn.Linear(d_model, d_model)
        self.v_linear = nn.Linear(d_model, d_model)
        self.k_linear = nn.Linear(d_model, d_model)

        self.dropout = nn.Dropout(dropout)
        self.out = nn.Linear(d_model, d_model)

    def forward(self, q, k, v, mask=None):
        bs = q.size(0)

        # perform linear operation and split into N heads
        k = self.k_linear(k).view(bs, -1, self.h, self.d_k)
        q = self.q_linear(q).view(bs, -1, self.h, self.d_k)
        v = self.v_linear(v).view(bs, -1, self.h, self.d_k)

        # transpose to get dimensions bs * N * sl * d_model
        k = k.transpose(1,2)
        q = q.transpose(1,2)
        v = v.transpose(1,2)

        # calculate attention using the function defined above
        scores = attention(q, k, v, self.d_k, mask, self.dropout)
        # concatenate heads and put through final linear layer
        concat = scores.transpose(1,2).contiguous()\
            .view(bs, -1, self.d_model)
        output = self.out(concat)

        return output

class FeedForward(nn.Module):
    """Position-wise two-layer MLP with ReLU; d_ff defaults to 2048."""
    def __init__(self, d_model, d_ff=2048, dropout = 0.1):
        super().__init__()

        # We set d_ff as a default to 2048
        self.linear_1 = nn.Linear(d_model, d_ff)
        self.dropout = nn.Dropout(dropout)
        self.linear_2 = nn.Linear(d_ff, d_model)

    def forward(self, x):
        x = self.dropout(F.relu(self.linear_1(x)))
        x = self.linear_2(x)
        return x


import torch
# NOTE(review): the chunk is truncated mid-import here; the remainder of this
# file continues beyond the visible range.
import torch.nn as nn
import copy


class EncoderLayer(nn.Module):
    """Pre-norm encoder layer: self-attention sub-layer + feed-forward sub-layer."""

    def __init__(self, d_model, heads, dropout=0.1):
        super().__init__()
        self.norm_1 = Norm(d_model)
        self.norm_2 = Norm(d_model)
        self.attn = MultiHeadAttention(heads, d_model, dropout=dropout)
        self.ff = FeedForward(d_model, dropout=dropout)
        self.dropout_1 = nn.Dropout(dropout)
        self.dropout_2 = nn.Dropout(dropout)

    def forward(self, x, mask):
        x2 = self.norm_1(x)
        x = x + self.dropout_1(self.attn(x2, x2, x2, mask))
        x2 = self.norm_2(x)
        x = x + self.dropout_2(self.ff(x2))
        return x


class DecoderLayer(nn.Module):
    """Pre-norm decoder layer: masked self-attention, encoder-decoder attention,
    then feed-forward."""

    def __init__(self, d_model, heads, dropout=0.1):
        super().__init__()
        self.norm_1 = Norm(d_model)
        self.norm_2 = Norm(d_model)
        self.norm_3 = Norm(d_model)
        self.dropout_1 = nn.Dropout(dropout)
        self.dropout_2 = nn.Dropout(dropout)
        self.dropout_3 = nn.Dropout(dropout)
        self.attn_1 = MultiHeadAttention(heads, d_model, dropout=dropout)
        self.attn_2 = MultiHeadAttention(heads, d_model, dropout=dropout)
        self.ff = FeedForward(d_model, dropout=dropout)

    def forward(self, x, e_outputs, src_mask, trg_mask):
        x2 = self.norm_1(x)
        x = x + self.dropout_1(self.attn_1(x2, x2, x2, trg_mask))
        x2 = self.norm_2(x)
        x = x + self.dropout_2(self.attn_2(x2, e_outputs, e_outputs, src_mask))
        x2 = self.norm_3(x)
        x = x + self.dropout_3(self.ff(x2))
        return x


import torch
import torch.nn as nn
import math
from torch.autograd import Variable


class Embedder(nn.Module):
    """Thin wrapper around nn.Embedding."""

    def __init__(self, vocab_size, d_model):
        super().__init__()
        self.d_model = d_model
        self.embed = nn.Embedding(vocab_size, d_model)

    def forward(self, x):
        return self.embed(x)


class PositionalEncoder(nn.Module):
    """Adds fixed sinusoidal positional encodings to scaled embeddings."""

    def __init__(self, d_model, max_seq_len=200, dropout=0.1):
        super().__init__()
        self.d_model = d_model
        self.dropout = nn.Dropout(dropout)
        # Constant 'pe' matrix with values dependent on position and channel.
        pe = torch.zeros(max_seq_len, d_model)
        for pos in range(max_seq_len):
            for i in range(0, d_model, 2):
                pe[pos, i] = math.sin(pos / (10000 ** ((2 * i) / d_model)))
                # BUG FIX: guard the odd channel -- the original indexed
                # pe[pos, i + 1] unconditionally, an IndexError for odd d_model.
                if i + 1 < d_model:
                    pe[pos, i + 1] = math.cos(pos / (10000 ** ((2 * (i + 1)) / d_model)))
        pe = pe.unsqueeze(0)
        self.register_buffer('pe', pe)

    def forward(self, x):
        # Make embeddings relatively larger before adding the constant.
        x = x * math.sqrt(self.d_model)
        seq_len = x.size(1)
        # BUG FIX: the original did `pe.cuda()` without assigning the result
        # (a no-op), so the addition could run on mismatched devices when the
        # buffer had not been moved; `.to(x.device)` is the correct,
        # Variable-free form and is a no-op when devices already match.
        x = x + self.pe[:, :seq_len].to(x.device)
        return self.dropout(x)


def get_clones(module, N):
    """Return a ModuleList of N deep copies of *module*."""
    return nn.ModuleList([copy.deepcopy(module) for i in range(N)])


class Encoder(nn.Module):
    """Embedding + positional encoding + N encoder layers + final Norm."""

    def __init__(self, vocab_size, d_model, N, heads, dropout):
        super().__init__()
        self.N = N
        self.embed = Embedder(vocab_size, d_model)
        self.pe = PositionalEncoder(d_model, dropout=dropout)
        self.layers = get_clones(EncoderLayer(d_model, heads, dropout), N)
        self.norm = Norm(d_model)

    def forward(self, src, mask):
        x = self.pe(self.embed(src))
        for i in range(self.N):
            x = self.layers[i](x, mask)
        return self.norm(x)


class Decoder(nn.Module):
    """Embedding + positional encoding + N decoder layers + final Norm."""

    def __init__(self, vocab_size, d_model, N, heads, dropout):
        super().__init__()
        self.N = N
        self.embed = Embedder(vocab_size, d_model)
        self.pe = PositionalEncoder(d_model, dropout=dropout)
        self.layers = get_clones(DecoderLayer(d_model, heads, dropout), N)
        self.norm = Norm(d_model)

    def forward(self, trg, e_outputs, src_mask, trg_mask):
        x = self.pe(self.embed(trg))
        for i in range(self.N):
            x = self.layers[i](x, e_outputs, src_mask, trg_mask)
        return self.norm(x)


class Model(nn.Module):
    """Seq2seq transformer: separate encoder/decoder vocabularies, linear head.

    Sizes are read from a config dict keyed by ``SCMG.config.varables``
    constants plus the literal keys ``"vocab_encoder"`` / ``"vocab_decoder"``.
    """

    def __init__(self, config):
        super().__init__()
        d_model = config[varables.DIM_ATTENTION]
        n_layers = config[varables.NUM_LAYERS]
        heads = config[varables.NUM_HEADS]
        dropout = config[varables.RATE_DROPOUT]
        self.encoder = Encoder(len(config["vocab_encoder"]), d_model, n_layers, heads, dropout)
        self.decoder = Decoder(len(config["vocab_decoder"]), d_model, n_layers, heads, dropout)
        self.out = nn.Linear(d_model, len(config["vocab_decoder"]))
        # Set externally before init_scheduler() is usable.
        self.optimizer = None

    def get_block_size(self):
        # NOTE(review): self.block_size is never assigned in this variant (the
        # assignment is commented out), so calling this raises AttributeError.
        # Kept as-is to preserve the interface; confirm whether it is still needed.
        return self.block_size

    def _init_weights(self, module):
        # GPT-style init: N(0, 0.02) for linear/embedding, unit LayerNorm.
        if isinstance(module, (nn.Linear, nn.Embedding)):
            module.weight.data.normal_(mean=0.0, std=0.02)
            if isinstance(module, nn.Linear) and module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

    def init_optimizers(self, train_config):
        optimizer = torch.optim.Adam(self.parameters(), lr=train_config[varables.RATE_LEARNING])
        return optimizer

    def init_scheduler(self, train_config):
        # NOTE(review): relies on self.optimizer having been assigned by the
        # caller; it is None after __init__.
        scheduler = torch.optim.lr_scheduler.StepLR(
            self.optimizer,
            step_size=train_config[varables.SIZE_STEP],
            gamma=train_config[varables.GAMMA],
        )
        return scheduler

    def get_collate_fn(self, vocab_encoder, vocab_decoder):
        """Return a collate fn that right-pads encoder/decoder sequences."""

        def collate(results):
            x_in = [a[0] for a in results]
            y_in = [a[1] for a in results]
            boundary = -1  # unused placeholder, kept for interface parity
            max_len_x = max(len(a) for a in x_in)
            max_len_y = max(len(a) for a in y_in)
            x = torch.tensor(
                [a + [vocab_encoder[varables.TOKEN_PAD]] * (max_len_x - len(a)) for a in x_in],
                dtype=torch.long,
            )
            y = torch.tensor(
                [a + [vocab_decoder[varables.TOKEN_PAD]] * (max_len_y - len(a)) for a in y_in],
                dtype=torch.long,
            )
            return x, y, boundary

        return collate

    def forward(self, src, trg, trg_out, boundary=None):
        """Return (logits, loss); loss is None when trg_out is None."""
        src_mask = None  # NOTE(review): no padding mask on the encoder side
        trg_mask = (
            torch.tril(torch.ones(trg.shape[1], trg.shape[1]))
            .view(1, 1, trg.shape[1], trg.shape[1])
            .to(trg.device)
        )
        e_outputs = self.encoder(src, src_mask)
        d_output = self.decoder(trg, e_outputs, src_mask, trg_mask)
        logits = self.out(d_output)
        loss = None
        if trg_out is not None:
            loss = F.cross_entropy(logits.view(-1, logits.size(-1)), trg_out.view(-1))
        return logits, loss

# mark test

# ---- SCMG/models/Transformer_debug5 copy/model copy.py ----
import math
import logging

import torch
import torch.nn as nn
from torch.nn import functional as F

logger = logging.getLogger(__name__)
from SCMG.config import varables
class CausalSelfAttention(nn.Module):
    """Masked multi-head self-attention (positions attend only backwards)."""

    def __init__(self, config):
        super().__init__()
        assert config[varables.DIM_ATTENTION] % config[varables.NUM_HEADS] == 0
        d_emb = config[varables.DIM_EMBEDDING]
        d_att = config[varables.DIM_ATTENTION]
        block = config[varables.SIZE_BLOCK]
        p_drop = config[varables.RATE_DROPOUT]
        self.key = nn.Linear(d_emb, d_att)
        self.query = nn.Linear(d_emb, d_att)
        self.value = nn.Linear(d_emb, d_att)
        self.dropout_attention = nn.Dropout(p_drop)
        self.dropout_residue = nn.Dropout(p_drop)
        self.projection = nn.Linear(d_att, d_emb)
        # (1, 1, block, block) lower-triangular causal mask.
        self.register_buffer(
            "mask",
            torch.tril(torch.ones(block, block)).view(1, 1, block, block),
        )
        self.n_head = config[varables.NUM_HEADS]
        self.single_head_dim = d_att // self.n_head
        self.attention_features = d_att

    def forward(self, x, layer_past=None):
        B, T, _ = x.size()
        k = self.key(x).view(B, T, self.n_head, self.single_head_dim).transpose(1, 2)
        q = self.query(x).view(B, T, self.n_head, self.single_head_dim).transpose(1, 2)
        v = self.value(x).view(B, T, self.n_head, self.single_head_dim).transpose(1, 2)
        att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1)))
        att = att.masked_fill(self.mask[:, :, :T, :T] == 0, float('-inf'))
        att = self.dropout_attention(F.softmax(att, dim=-1))
        y = (att @ v).transpose(1, 2).contiguous().view(B, T, self.attention_features)
        return self.dropout_residue(self.projection(y))


class CrossAttention(nn.Module):
    """Decoder-to-encoder attention.

    NOTE(review): applies the triangular causal mask across the encoder axis
    (decoder position t sees encoder positions <= t), which is unusual for
    cross-attention -- confirm intent.
    """

    def __init__(self, config):
        super().__init__()
        assert config[varables.DIM_ATTENTION] % config[varables.NUM_HEADS] == 0
        d_emb = config[varables.DIM_EMBEDDING]
        d_att = config[varables.DIM_ATTENTION]
        block = config[varables.SIZE_BLOCK]
        p_drop = config[varables.RATE_DROPOUT]
        self.key = nn.Linear(d_emb, d_att)
        self.query = nn.Linear(d_emb, d_att)
        self.value = nn.Linear(d_emb, d_att)
        self.dropout_attention = nn.Dropout(p_drop)
        self.dropout_residue = nn.Dropout(p_drop)
        self.projection = nn.Linear(d_att, d_emb)
        self.n_head = config[varables.NUM_HEADS]
        self.single_head_dim = d_att // self.n_head
        self.attention_features = d_att
        self.register_buffer(
            "mask",
            torch.tril(torch.ones(block, block)).view(1, 1, block, block),
        )

    def forward(self, x_encoder, x_decoder, layer_past=None):
        B, T_enc, _ = x_encoder.size()
        _, T_dec, _ = x_decoder.size()
        k = self.key(x_encoder).view(B, T_enc, self.n_head, self.single_head_dim).transpose(1, 2)
        q = self.query(x_decoder).view(B, T_dec, self.n_head, self.single_head_dim).transpose(1, 2)
        v = self.value(x_encoder).view(B, T_enc, self.n_head, self.single_head_dim).transpose(1, 2)
        att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1)))
        att = att.masked_fill(self.mask[:, :, :T_dec, :T_enc] == 0, float('-inf'))
        att = self.dropout_attention(F.softmax(att, dim=-1))
        y = (att @ v).transpose(1, 2).contiguous().view(B, T_dec, self.attention_features)
        return self.dropout_residue(self.projection(y))


class EncoderBlock(nn.Module):
    """Pre-norm encoder block: self-attention residual + MLP residual."""

    def __init__(self, config):
        super().__init__()
        d_emb = config[varables.DIM_EMBEDDING]
        self.ln1 = nn.LayerNorm(d_emb)
        self.ln2 = nn.LayerNorm(d_emb)
        self.attn = CausalSelfAttention(config)
        self.mlp = nn.Sequential(
            nn.Linear(d_emb, config[varables.DIM_FEEDFORWARD]),
            nn.GELU(),
            nn.Linear(config[varables.DIM_FEEDFORWARD], d_emb),
            nn.Dropout(config[varables.RATE_DROPOUT]),
        )

    def forward(self, x):
        x = x + self.attn(self.ln1(x))
        return x + self.mlp(self.ln2(x))


class DecoderBlock(nn.Module):
    """Pre-norm decoder block: masked self-attention, cross-attention, MLP.

    NOTE(review): ``ln1`` normalises the input of both attention sub-layers.
    """

    def __init__(self, config):
        super().__init__()
        d_emb = config[varables.DIM_EMBEDDING]
        self.ln1 = nn.LayerNorm(d_emb)
        self.ln2 = nn.LayerNorm(d_emb)
        self.masked_attn = CausalSelfAttention(config)
        self.cross_attn = CrossAttention(config)
        self.mlp = nn.Sequential(
            nn.Linear(d_emb, config[varables.DIM_FEEDFORWARD]),
            nn.GELU(),
            nn.Linear(config[varables.DIM_FEEDFORWARD], d_emb),
            nn.Dropout(config[varables.RATE_DROPOUT]),
        )

    def forward(self, x_encoder, x):
        x = x + self.masked_attn(self.ln1(x))
        x = x + self.cross_attn(x_encoder, self.ln1(x))
        return x + self.mlp(self.ln2(x))


class Model(nn.Module):
    """Encoder-decoder transformer sharing one token embedding and vocabulary.

    NOTE(review): the same final LayerNorm ``ln_f`` is applied to both the
    encoder output and the decoder output -- confirm this sharing is intended.
    """

    def __init__(self, config):
        super().__init__()
        size_vocab = config[varables.SIZE_VOCAB]
        d_emb = config[varables.DIM_EMBEDDING]
        n_layers = config[varables.NUM_LAYERS]
        self.tok_emb = nn.Embedding(size_vocab, d_emb)
        self.pos_emb = nn.Parameter(torch.zeros(1, config[varables.SIZE_BLOCK], d_emb))
        self.drop = nn.Dropout(config[varables.RATE_DROPOUT])
        self.encoder_blocks = nn.ModuleList(EncoderBlock(config) for _ in range(n_layers))
        self.decoder_blocks = nn.ModuleList(DecoderBlock(config) for _ in range(n_layers))
        self.ln_f = nn.LayerNorm(d_emb)
        self.head = nn.Linear(d_emb, size_vocab, bias=False)
        self.block_size = config[varables.SIZE_BLOCK]
        self.apply(self._init_weights)
        logger.info("number of parameters: %e", sum(p.numel() for p in self.parameters()))
        self.optimizer = None

    def get_block_size(self):
        return self.block_size

    def _init_weights(self, module):
        # GPT-style initialisation.
        if isinstance(module, (nn.Linear, nn.Embedding)):
            module.weight.data.normal_(mean=0.0, std=0.02)
            if isinstance(module, nn.Linear) and module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

    def init_optimizers(self, train_config):
        optimizer = torch.optim.Adam(self.parameters(), lr=train_config[varables.RATE_LEARNING])
        return optimizer

    def init_scheduler(self, train_config):
        # NOTE(review): requires self.optimizer to be assigned by the caller.
        scheduler = torch.optim.lr_scheduler.StepLR(
            self.optimizer,
            step_size=train_config[varables.SIZE_STEP],
            gamma=train_config[varables.GAMMA],
        )
        return scheduler

    def get_collate_fn(self, vocab):
        """Collate fn padding both source and target with the shared vocab's pad token."""

        def collate(results):
            x_in = [a[0] for a in results]
            y_in = [a[1] for a in results]
            boundary = -1  # unused placeholder, kept for interface parity
            max_len_x = max(len(a) for a in x_in)
            max_len_y = max(len(a) for a in y_in)
            x = torch.tensor(
                [a + [vocab[varables.TOKEN_PAD]] * (max_len_x - len(a)) for a in x_in],
                dtype=torch.long,
            )
            y = torch.tensor(
                [a + [vocab[varables.TOKEN_PAD]] * (max_len_y - len(a)) for a in y_in],
                dtype=torch.long,
            )
            return x, y, boundary

        return collate

    def forward(self, x_in, y_in, y_out=None, boundary=None):
        """Return (logits, loss); loss is None when y_out is None."""
        x_in = self.drop(self.tok_emb(x_in) + self.pos_emb[:, :x_in.size()[1], :])
        y_in = self.drop(self.tok_emb(y_in) + self.pos_emb[:, :y_in.size()[1], :])
        for encoder_block in self.encoder_blocks:
            x_in = encoder_block(x_in)
        x_in = self.ln_f(x_in)
        for decoder_block in self.decoder_blocks:
            y_in = decoder_block(x_in, y_in)
        y_in = self.ln_f(y_in)
        logits = self.head(y_in)
        loss = None
        if y_out is not None:
            loss = F.cross_entropy(logits.view(-1, logits.size(-1)), y_out.view(-1))
        return logits, loss

# mark test

# ---- SCMG/models/Transformer_debug5 copy/model.py ----
import math
import logging

import torch
import torch.nn as nn
from torch.nn import functional as F

# logger = logging.getLogger(__name__)
from SCMG.config import varables
from torch.autograd import Variable
class PositionalEncoder(nn.Module):
    """Fixed sinusoidal positional encodings, returned (with dropout) by length.

    Unlike the tutorial version above, ``forward`` takes a sequence length T
    and returns the (dropped-out) encoding slice; the caller adds it itself.
    """

    def __init__(self, config):
        super(PositionalEncoder, self).__init__()
        self.Dropout = nn.Dropout(p=config[varables.RATE_DROPOUT])
        max_len = config[varables.SIZE_BLOCK]
        d_model = config[varables.DIM_EMBEDDING]
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        self.register_buffer('pe', pe.unsqueeze(0))

    def forward(self, T):
        """Return dropout(pe[:, :T, :]) -- shape (1, T, dim_embedding)."""
        return self.Dropout(self.pe[:, :T, :])


class Attention(nn.Module):
    """Unified multi-head attention: self-attention when X_2 is None,
    cross-attention (keys/values from X_1, queries from X_2) otherwise."""

    def __init__(self, config):
        super().__init__()
        assert config[varables.DIM_ATTENTION] % config[varables.NUM_HEADS] == 0
        d_emb = config[varables.DIM_EMBEDDING]
        d_att = config[varables.DIM_ATTENTION]
        self.Key = nn.Linear(d_emb, d_att)
        self.Query = nn.Linear(d_emb, d_att)
        self.Value = nn.Linear(d_emb, d_att)
        self.Dropout_Attention = nn.Dropout(config[varables.RATE_DROPOUT])
        self.Dropout_Residue = nn.Dropout(config[varables.RATE_DROPOUT])
        self.Projection = nn.Linear(d_att, d_emb)
        self.NumberOfHeads = config[varables.NUM_HEADS]
        self.DimHead = d_att // self.NumberOfHeads
        self.DimAttention = d_att

    def forward(self, X_1, X_2, mask=None):
        if X_2 is None:
            X_2 = X_1  # self-attention
        BatchSize, T_Encoder, _ = X_1.size()
        BatchSize, T_Decoder, _ = X_2.size()
        K = self.Key(X_1).view(BatchSize, T_Encoder, self.NumberOfHeads, self.DimHead).transpose(1, 2)
        Q = self.Query(X_2).view(BatchSize, T_Decoder, self.NumberOfHeads, self.DimHead).transpose(1, 2)
        V = self.Value(X_1).view(BatchSize, T_Encoder, self.NumberOfHeads, self.DimHead).transpose(1, 2)
        ScoreAttention = (Q @ K.transpose(-2, -1)) / math.sqrt(self.DimHead)
        # mask == 0 positions are suppressed with a large negative value.
        ScoreAttention = ScoreAttention.masked_fill(mask == 0, -1e9)
        ScoreAttention = F.softmax(ScoreAttention, dim=-1)
        ScoreAttention = self.Dropout_Attention(ScoreAttention)
        Z = ScoreAttention @ V
        Z = Z.transpose(1, 2).contiguous().view(BatchSize, T_Decoder, self.DimAttention)
        return self.Dropout_Residue(self.Projection(Z))


class FeedForward(nn.Module):
    """Position-wise MLP; width defaults to 4x attention dim when
    DIM_FEEDFORWARD is 0."""

    def __init__(self, config):
        super().__init__()
        if config[varables.DIM_FEEDFORWARD] == 0:
            Dim_FeedForward = config[varables.DIM_ATTENTION] * 4
        else:
            Dim_FeedForward = config[varables.DIM_FEEDFORWARD]
        self.Linear1 = nn.Linear(config[varables.DIM_EMBEDDING], Dim_FeedForward)
        self.GELU = nn.GELU()
        self.Linear2 = nn.Linear(Dim_FeedForward, config[varables.DIM_EMBEDDING])
        self.Dropout = nn.Dropout(config[varables.RATE_DROPOUT])

    def forward(self, x):
        return self.Linear2(self.Dropout(self.GELU(self.Linear1(x))))


class EncoderBlock(nn.Module):
    """Post-norm encoder block.

    NOTE(review): dropout is applied to the *input* of each sub-layer rather
    than its output, which differs from the standard formulation -- confirm.
    """

    def __init__(self, config):
        super().__init__()
        d_emb = config[varables.DIM_EMBEDDING]
        self.LayerNorm1 = nn.LayerNorm(d_emb)
        self.LayerNorm2 = nn.LayerNorm(d_emb)
        self.Dropout1 = nn.Dropout(config[varables.RATE_DROPOUT])
        self.Dropout2 = nn.Dropout(config[varables.RATE_DROPOUT])
        self.Attention = Attention(config)
        self.FeedForward = FeedForward(config)

    def forward(self, X_Encoder, Mask_Encoder):
        X_Encoder = self.LayerNorm1(X_Encoder + self.Attention(self.Dropout1(X_Encoder), None, Mask_Encoder))
        X_Encoder = self.LayerNorm2(X_Encoder + self.FeedForward(self.Dropout2(X_Encoder)))
        return X_Encoder


class DecoderBlock(nn.Module):
    """Post-norm decoder block: masked self-attention, cross-attention, MLP."""

    def __init__(self, config):
        super().__init__()
        d_emb = config[varables.DIM_EMBEDDING]
        self.LayerNorm1 = nn.LayerNorm(d_emb)
        self.LayerNorm2 = nn.LayerNorm(d_emb)
        self.LayerNorm3 = nn.LayerNorm(d_emb)
        self.Dropout1 = nn.Dropout(config[varables.RATE_DROPOUT])
        self.Dropout2 = nn.Dropout(config[varables.RATE_DROPOUT])
        self.Dropout3 = nn.Dropout(config[varables.RATE_DROPOUT])
        self.AttentionMasked = Attention(config)
        self.AttentionCross = Attention(config)
        self.FeedForward = FeedForward(config)

    def forward(self, X_Encoder, X_Decoder, Mask_Cross, Mask_Decoder):
        X_Decoder = self.LayerNorm1(X_Decoder + self.AttentionMasked(self.Dropout1(X_Decoder), None, Mask_Decoder))
        X_Decoder = self.LayerNorm2(X_Decoder + self.AttentionCross(X_Encoder, self.Dropout2(X_Decoder), Mask_Cross))
        X_Decoder = self.LayerNorm3(X_Decoder + self.FeedForward(self.Dropout3(X_Decoder)))
        return X_Decoder


class Model(nn.Module):
    """Encoder-decoder transformer with separate vocabularies and explicit
    padding/causal masks built in :meth:`generate_masks`."""

    def __init__(self, config):
        super().__init__()
        # Configuration values used in forward().
        self.Dim_Embedding = config[varables.DIM_EMBEDDING]
        self.Token_Padding_Encoder = config["Token_Padding_Encoder"]
        self.Token_Padding_Decoder = config["Token_Padding_Decoder"]
        # Embedding and positional encoding layers.
        self.Embedding_Encoder = nn.Embedding(len(config["vocab_encoder"]), self.Dim_Embedding)
        self.Embedding_Decoder = nn.Embedding(len(config["vocab_decoder"]), self.Dim_Embedding)
        self.pos_emb = PositionalEncoder(config)
        # Dropout and normalisation layers.  NOTE(review): LayerNorm1/2 are
        # currently unused (their calls in forward() are commented out); kept
        # so parameter/state_dict layout is unchanged.
        self.Dropout1 = nn.Dropout(config[varables.RATE_DROPOUT])
        self.Dropout2 = nn.Dropout(config[varables.RATE_DROPOUT])
        self.LayerNorm1 = nn.LayerNorm(self.Dim_Embedding)
        self.LayerNorm2 = nn.LayerNorm(self.Dim_Embedding)
        # Transformer stacks.
        self.encoder_blocks = nn.ModuleList([EncoderBlock(config) for _ in range(config[varables.NUM_LAYERS])])
        self.decoder_blocks = nn.ModuleList([DecoderBlock(config) for _ in range(config[varables.NUM_LAYERS])])
        # Output projection to decoder vocabulary.
        self.head = nn.Linear(self.Dim_Embedding, len(config["vocab_decoder"]), bias=False)
        self.apply(self._init_weights)
        self.optimizer = None

    def _init_weights(self, module):
        # Xavier-uniform for every parameter with rank > 1.
        for p in module.parameters():
            if p.dim() > 1:
                nn.init.xavier_uniform_(p)

    def init_optimizers(self, train_config):
        optimizer = torch.optim.Adam(self.parameters(), lr=train_config[varables.RATE_LEARNING])
        return optimizer

    def init_scheduler(self, train_config):
        # NOTE(review): requires self.optimizer to be assigned by the caller.
        scheduler = torch.optim.lr_scheduler.StepLR(
            self.optimizer,
            step_size=train_config[varables.SIZE_STEP],
            gamma=train_config[varables.GAMMA],
        )
        return scheduler

    def get_collate_fn(self, vocab_encoder, vocab_decoder):
        """Collate fn right-padding encoder and decoder sequences separately."""

        def collate(results):
            X_Encoder = [a[0] for a in results]
            X_Decoder = [a[1] for a in results]
            boundary = -1  # unused placeholder, kept for interface parity
            max_len_x = max(len(a) for a in X_Encoder)
            max_len_y = max(len(a) for a in X_Decoder)
            x = torch.tensor(
                [a + [vocab_encoder[varables.TOKEN_PAD]] * (max_len_x - len(a)) for a in X_Encoder],
                dtype=torch.long,
            )
            y = torch.tensor(
                [a + [vocab_decoder[varables.TOKEN_PAD]] * (max_len_y - len(a)) for a in X_Decoder],
                dtype=torch.long,
            )
            return x, y, boundary

        return collate

    def generate_masks(self, X_Encoder, X_Decoder):
        """Build (encoder padding, decoder causal+padding, cross padding) masks."""
        T = X_Decoder.shape[1]
        Mask_Encoder = (X_Encoder != self.Token_Padding_Encoder).unsqueeze(-2).unsqueeze(-2)
        Mask_Decoder = (X_Decoder != self.Token_Padding_Decoder).unsqueeze(-2).unsqueeze(-2).repeat(1, 1, T, 1)
        Mask_Cross = (X_Encoder != self.Token_Padding_Encoder).unsqueeze(-2).unsqueeze(-2)
        # Combine the padding mask with the lower-triangular causal mask.
        mask_tril = torch.tril(torch.ones(T, T)).view(1, 1, T, T).to(Mask_Decoder.device)
        Mask_Decoder = Mask_Decoder.masked_fill(mask_tril == 0, 0)
        return Mask_Encoder, Mask_Decoder, Mask_Cross

    def forward(self, X_Encoder, X_Decoder, Y_Decoder_Ref=None, boundary=None):
        """Return (logits, loss); loss is None when Y_Decoder_Ref is None."""
        Mask_Encoder, Mask_Decoder, Mask_Cross = self.generate_masks(X_Encoder, X_Decoder)
        # Scaled embeddings + positional encodings.
        X_Encoder = self.Dropout1(self.Embedding_Encoder(X_Encoder) * math.sqrt(self.Dim_Embedding) + self.pos_emb(X_Encoder.size(1)))
        X_Decoder = self.Dropout2(self.Embedding_Decoder(X_Decoder) * math.sqrt(self.Dim_Embedding) + self.pos_emb(X_Decoder.size(1)))
        for encoder_block in self.encoder_blocks:
            X_Encoder = encoder_block(X_Encoder, Mask_Encoder)
        for decoder_block in self.decoder_blocks:
            X_Decoder = decoder_block(X_Encoder, X_Decoder, Mask_Cross, Mask_Decoder)
        Y_Decoder_Logits = self.head(X_Decoder)
        loss = None
        if Y_Decoder_Ref is not None:
            # BUG FIX: the original called F.log_softmax without `dim`, which
            # is deprecated and relies on an implicit-dim heuristic; dim=-1
            # (the vocabulary axis) is the intended one and makes this
            # equivalent to F.cross_entropy with ignore_index.
            loss = F.nll_loss(
                F.log_softmax(Y_Decoder_Logits, dim=-1).view(-1, Y_Decoder_Logits.size(-1)),
                Y_Decoder_Ref.view(-1),
                ignore_index=self.Token_Padding_Decoder,
            )
        return Y_Decoder_Logits, loss


# ---- SCMG/models/Transformer_debug5 copy/sampler.py ----
import random
import numpy as np
import torch
import torch.nn as nn
from torch.nn import functional as F


def set_seed(seed):
    """Seed python, numpy and torch (all CUDA devices) for reproducibility."""
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)


def top_k_logits(logits, k):
    """Set every logit below the k-th largest (per row) to -inf."""
    v, ix = torch.topk(logits, k)
    out = logits.clone()
    out[out < v[:, [-1]]] = -float('Inf')
    return out


@torch.no_grad()
def sample(model, x, steps, temperature=1.0, sample=False, top_k=None):
    """Autoregressive sampling with optional top-k filtering.

    NOTE(review): this definition is immediately shadowed by the boundary-aware
    `sample` below -- it is dead code kept for reference.
    """
    block_size = model.get_block_size()
    model.eval()
    for k in range(steps):
        x_cond = x if x.size(1) <= block_size else x[:, -block_size:]
        logits, _ = model(x_cond)
        logits = logits[:, -1, :] / temperature
        if top_k is not None:
            logits = top_k_logits(logits, top_k)
        probs = F.softmax(logits, dim=-1)
        if sample:
            ix = torch.multinomial(probs, num_samples=1)
        else:
            _, ix = torch.topk(probs, k=1, dim=-1)
        x = torch.cat((x, ix), dim=1)
    return x


@torch.no_grad()
def sample(model, x, steps, temperature=1.0, boundary=None):
    """Boundary-aware autoregressive multinomial sampling (shadows the above)."""
    block_size = model.get_block_size()
    model.eval()
    for k in range(steps):
        x_cond = x if x.size(1) <= block_size else x[:, -block_size:]
        logits, _ = model(x_cond, boundary=boundary)
        logits = logits[:, -1, :] / temperature
        probs = F.softmax(logits, dim=-1)
        ix = torch.multinomial(probs, num_samples=1)
        x = torch.cat((x, ix), dim=1)
    return x
'L_5*C(=O)NCc1cccc(OC)c1.*c1nsc2ccccc12COc1cccc(CNC(=O)c2cccc(NC(=O)c3nsc4ccccc34)c2)c1'


def sample_L(i, option='string'):
    """Debug helper: sample 32 completions for scaffold prompt ``L_<i>`` and
    print each decoded string plus a 1/0 validity flag.

    NOTE(review): relies on module globals that are not defined in this file
    (``vocab``, ``inv``, ``model``, ``test_valid``) and hard-codes the 'cuda'
    device -- this is scratch code and will raise NameError if imported and
    called as-is.
    """
    prefix = 'L_' + str(i)
    string_input = prefix + '*O=C1NN=Cc2c1cccc2.*O=C(C1CC1)N1CCNCC1'
    array_input = [vocab[a] for a in [''] + list(string_input)]
    boundary = [len(array_input)]
    tensor_input = torch.tensor(array_input, device='cuda').unsqueeze(0).repeat(32, 1)
    boundary = boundary * 32
    tensor_output = sample(model, tensor_input, 250, boundary=boundary)

    strings_output = []
    for j in range(tensor_output.shape[0]):
        # Decode everything after the prompt boundary, dropping pad-like tokens.
        decoded = [inv[a] for a in tensor_output[j, boundary[j]:].cpu().numpy() if a != vocab['']]
        if decoded[-1] == '':
            decoded = decoded[:-1]
        text = ''.join(decoded)
        strings_output.append(text)
        print(text)

    for j in range(tensor_output.shape[0]):
        # Print a 1/0 validity flag per sampled string.
        if test_valid(strings_output[j]):
            print(1)
        else:
            print(0)


['', 'L', '_', '5', '*', 'C', '(', '=', 'O', ')', 'N', 'C', 'c', '1', 'c', 'c', 'c', 'c', '(', 'O', 'C', ')', 'c', '1', '.', '*', 'c', '1', 'n', 's', 'c', '2', 'c', 'c', 'c', 'c', 'c', '1', '2', 'C', 'O', 'c', '1', 'c', 'c', 'c', 'c', '(', 'C', 'N', 'C', '(', '=', 'O', ')', 'c', '2', 'c', 'c', 'c', 'c', '(', 'N', 'C', '(', '=', 'O', ')', 'c', '3', 'n', 's', 'c', '4', 'c', 'c', 'c', 'c', 'c', '3', '4', ')', 'c', '2', ')', 'c', '1', '']
0000000000000000000000000000000000000000..4f4ce893aa53a9dc96895f7acf5c827cc60d3e1c Binary files /dev/null and b/SCMG/models/Transformer_debug5/__pycache__/__init__.cpython-310.pyc differ diff --git a/SCMG/models/Transformer_debug5/__pycache__/model copy 2.cpython-310.pyc b/SCMG/models/Transformer_debug5/__pycache__/model copy 2.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8f5ed60720263fd980189ce4e223ee6c8dfdd401 Binary files /dev/null and b/SCMG/models/Transformer_debug5/__pycache__/model copy 2.cpython-310.pyc differ diff --git a/SCMG/models/Transformer_debug5/__pycache__/model copy.cpython-310.pyc b/SCMG/models/Transformer_debug5/__pycache__/model copy.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..bfa1b683dc47e5ff8a5279b1c3e5ea64eeec8aa1 Binary files /dev/null and b/SCMG/models/Transformer_debug5/__pycache__/model copy.cpython-310.pyc differ diff --git a/SCMG/models/Transformer_debug5/__pycache__/model.cpython-310.pyc b/SCMG/models/Transformer_debug5/__pycache__/model.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..af9ac0f069787a05b4921c07a3c540c1712368e2 Binary files /dev/null and b/SCMG/models/Transformer_debug5/__pycache__/model.cpython-310.pyc differ diff --git a/SCMG/models/Transformer_debug5/__pycache__/sampler.cpython-310.pyc b/SCMG/models/Transformer_debug5/__pycache__/sampler.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8f9b18a160658196fcfc18fdf806256ed172319a Binary files /dev/null and b/SCMG/models/Transformer_debug5/__pycache__/sampler.cpython-310.pyc differ diff --git a/SCMG/models/Transformer_debug5/model copy 2.py b/SCMG/models/Transformer_debug5/model copy 2.py new file mode 100644 index 0000000000000000000000000000000000000000..8b254fbe02eafd3f7370cfae17eb6729563aa260 --- /dev/null +++ b/SCMG/models/Transformer_debug5/model copy 2.py @@ -0,0 +1,420 @@ +import math +import logging + +import 
torch +import torch.nn as nn +from torch.nn import functional as F + +logger = logging.getLogger(__name__) +from SCMG.config import varables + +# class ModelConfig(): +# rate_dropout_embedding = 0.1 +# rate_dropout_residue = 0.1 +# rate_dropout_attention = 0.1 +# block_size=125 +# def __init__(self, size_vocab, **kwargs): +# self.size_vocab = size_vocab +# for k,v in kwargs.items(): +# setattr(self, k, v) + +class CausalSelfAttention(nn.Module): + def __init__(self, config): + super().__init__() + assert config[varables.DIM_ATTENTION] % config[varables.NUM_HEADS] == 0 + self.key = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION]) + self.query = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION]) + self.value = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION]) + self.dropout_attention = nn.Dropout(config[varables.RATE_DROPOUT]) + self.dropout_residue = nn.Dropout(config[varables.RATE_DROPOUT]) + self.projection = nn.Linear(config[varables.DIM_ATTENTION], config[varables.DIM_EMBEDDING]) + self.register_buffer("mask", torch.tril(torch.ones(config[varables.SIZE_BLOCK], config[varables.SIZE_BLOCK])) + .view(1, 1, config[varables.SIZE_BLOCK], config[varables.SIZE_BLOCK])) + self.n_head = config[varables.NUM_HEADS] + self.single_head_dim = config[varables.DIM_ATTENTION] // self.n_head + self.attention_features = config[varables.DIM_ATTENTION] + + def forward(self, x, layer_past=None): + B, T, C = x.size() + k = self.key(x).view(B, T, self.n_head,self.single_head_dim).transpose(1, 2) + q = self.query(x).view(B, T, self.n_head,self.single_head_dim).transpose(1, 2) + v = self.value(x).view(B, T, self.n_head,self.single_head_dim).transpose(1, 2) + att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1))) + att = att.masked_fill(self.mask[:,:,:T,:T] == 0, float('-inf')) + att = F.softmax(att, dim=-1) + att = self.dropout_attention(att) + y = att @ v + y = y.transpose(1, 2).contiguous().view(B, T, 
self.attention_features) + y = self.dropout_residue(self.projection(y)) + return y + + +class CrossAttention(nn.Module): + def __init__(self, config): + super().__init__() + assert config[varables.DIM_ATTENTION] % config[varables.NUM_HEADS] == 0 + self.key = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION]) + self.query = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION]) + self.value = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION]) + self.dropout_attention = nn.Dropout(config[varables.RATE_DROPOUT]) + self.dropout_residue = nn.Dropout(config[varables.RATE_DROPOUT]) + self.projection = nn.Linear(config[varables.DIM_ATTENTION], config[varables.DIM_EMBEDDING]) + self.n_head = config[varables.NUM_HEADS] + self.single_head_dim = config[varables.DIM_ATTENTION] // self.n_head + self.attention_features = config[varables.DIM_ATTENTION] + self.register_buffer("mask", torch.tril(torch.ones(config[varables.SIZE_BLOCK], config[varables.SIZE_BLOCK])) + .view(1, 1, config[varables.SIZE_BLOCK], config[varables.SIZE_BLOCK])) + + def forward(self, x_encoder,x_decoder, layer_past=None): + B_encoder, T_encoder, C_encoder = x_encoder.size() + B_decoder, T_decoder, C_decoder = x_decoder.size() + k = self.key( x_encoder).view(B_encoder, T_encoder, self.n_head,self.single_head_dim).transpose(1, 2) + q = self.query(x_decoder).view(B_encoder, T_decoder, self.n_head,self.single_head_dim).transpose(1, 2) + v = self.value(x_encoder).view(B_encoder, T_encoder, self.n_head,self.single_head_dim).transpose(1, 2) + att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1))) + att = att.masked_fill(self.mask[:,:,:T_decoder,:T_encoder] == 0, float('-inf')) + att = F.softmax(att, dim=-1) + att = self.dropout_attention(att) + y = att @ v + y = y.transpose(1, 2).contiguous().view(B_encoder, T_decoder, self.attention_features) + y = self.dropout_residue(self.projection(y)) + return y + + + + +class EncoderBlock(nn.Module): + 
def __init__(self, config): + super().__init__() + self.ln1 = nn.LayerNorm(config[varables.DIM_EMBEDDING]) + self.ln2 = nn.LayerNorm(config[varables.DIM_EMBEDDING]) + self.attn = CausalSelfAttention(config) + self.mlp = nn.Sequential( + nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_FEEDFORWARD]), + nn.GELU(), + nn.Linear(config[varables.DIM_FEEDFORWARD], config[varables.DIM_EMBEDDING]), + nn.Dropout(config[varables.RATE_DROPOUT]), + ) + + def forward(self, x): + # = y_input + x = x + self.attn(self.ln1(x)) + x = x + self.mlp(self.ln2(x)) + return x + +class DecoderBlock(nn.Module): + def __init__(self, config): + super().__init__() + self.ln1 = nn.LayerNorm(config[varables.DIM_EMBEDDING]) + self.ln2 = nn.LayerNorm(config[varables.DIM_EMBEDDING]) + self.masked_attn = CausalSelfAttention(config) + self.cross_attn = CrossAttention(config) + self.mlp = nn.Sequential( + nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_FEEDFORWARD]), + nn.GELU(), + nn.Linear(config[varables.DIM_FEEDFORWARD], config[varables.DIM_EMBEDDING]), + nn.Dropout(config[varables.RATE_DROPOUT]), + ) + + def forward(self, x_encoder,x): + # = y_input + x = x + self.masked_attn(self.ln1(x)) + x = x + self.cross_attn(x_encoder,self.ln1(x)) + x = x + self.mlp(self.ln2(x)) + return x + + + + + + + + + + + + + + + + + +import torch +import torch.nn as nn +import torch.nn.functional as F +import math + + +class Norm(nn.Module): + def __init__(self, d_model, eps = 1e-6): + super().__init__() + + self.size = d_model + + # create two learnable parameters to calibrate normalisation + self.alpha = nn.Parameter(torch.ones(self.size)) + self.bias = nn.Parameter(torch.zeros(self.size)) + + self.eps = eps + + def forward(self, x): + norm = self.alpha * (x - x.mean(dim=-1, keepdim=True)) \ + / (x.std(dim=-1, keepdim=True) + self.eps) + self.bias + return norm + +def attention(q, k, v, d_k, mask=None, dropout=None): + + scores = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(d_k) + + if 
mask is not None: + mask = mask.unsqueeze(1) + scores = scores.masked_fill(mask == 0, -1e9) + + scores = F.softmax(scores, dim=-1) + + if dropout is not None: + scores = dropout(scores) + + output = torch.matmul(scores, v) + return output + +class MultiHeadAttention(nn.Module): + def __init__(self, heads, d_model, dropout = 0.1): + super().__init__() + + self.d_model = d_model + self.d_k = d_model // heads + self.h = heads + + self.q_linear = nn.Linear(d_model, d_model) + self.v_linear = nn.Linear(d_model, d_model) + self.k_linear = nn.Linear(d_model, d_model) + + self.dropout = nn.Dropout(dropout) + self.out = nn.Linear(d_model, d_model) + + def forward(self, q, k, v, mask=None): + + bs = q.size(0) + + # perform linear operation and split into N heads + k = self.k_linear(k).view(bs, -1, self.h, self.d_k) + q = self.q_linear(q).view(bs, -1, self.h, self.d_k) + v = self.v_linear(v).view(bs, -1, self.h, self.d_k) + + # transpose to get dimensions bs * N * sl * d_model + k = k.transpose(1,2) + q = q.transpose(1,2) + v = v.transpose(1,2) + + + # calculate attention using function we will define next + scores = attention(q, k, v, self.d_k, mask, self.dropout) + # concatenate heads and put through final linear layer + concat = scores.transpose(1,2).contiguous()\ + .view(bs, -1, self.d_model) + output = self.out(concat) + + return output + +class FeedForward(nn.Module): + def __init__(self, d_model, d_ff=2048, dropout = 0.1): + super().__init__() + + # We set d_ff as a default to 2048 + self.linear_1 = nn.Linear(d_model, d_ff) + self.dropout = nn.Dropout(dropout) + self.linear_2 = nn.Linear(d_ff, d_model) + + def forward(self, x): + x = self.dropout(F.relu(self.linear_1(x))) + x = self.linear_2(x) + return x + + + + +import torch +import torch.nn as nn +import copy + + +class EncoderLayer(nn.Module): + def __init__(self, d_model, heads, dropout=0.1): + super().__init__() + self.norm_1 = Norm(d_model) + self.norm_2 = Norm(d_model) + self.attn = MultiHeadAttention(heads, 
d_model, dropout=dropout) + self.ff = FeedForward(d_model, dropout=dropout) + self.dropout_1 = nn.Dropout(dropout) + self.dropout_2 = nn.Dropout(dropout) + + def forward(self, x, mask): + x2 = self.norm_1(x) + x = x + self.dropout_1(self.attn(x2,x2,x2,mask)) + x2 = self.norm_2(x) + x = x + self.dropout_2(self.ff(x2)) + return x + +# build a decoder layer with two multi-head attention layers and +# one feed-forward layer +class DecoderLayer(nn.Module): + def __init__(self, d_model, heads, dropout=0.1): + super().__init__() + self.norm_1 = Norm(d_model) + self.norm_2 = Norm(d_model) + self.norm_3 = Norm(d_model) + + self.dropout_1 = nn.Dropout(dropout) + self.dropout_2 = nn.Dropout(dropout) + self.dropout_3 = nn.Dropout(dropout) + + self.attn_1 = MultiHeadAttention(heads, d_model, dropout=dropout) + self.attn_2 = MultiHeadAttention(heads, d_model, dropout=dropout) + self.ff = FeedForward(d_model, dropout=dropout) + + def forward(self, x, e_outputs, src_mask, trg_mask): + x2 = self.norm_1(x) + x = x + self.dropout_1(self.attn_1(x2, x2, x2, trg_mask)) + x2 = self.norm_2(x) + x = x + self.dropout_2(self.attn_2(x2, e_outputs, e_outputs, \ + src_mask)) + x2 = self.norm_3(x) + x = x + self.dropout_3(self.ff(x2)) + return x + + +import torch +import torch.nn as nn +import math +from torch.autograd import Variable + +class Embedder(nn.Module): + def __init__(self, vocab_size, d_model): + super().__init__() + self.d_model = d_model + self.embed = nn.Embedding(vocab_size, d_model) + def forward(self, x): + return self.embed(x) + +class PositionalEncoder(nn.Module): + def __init__(self, d_model, max_seq_len = 200, dropout = 0.1): + super().__init__() + self.d_model = d_model + self.dropout = nn.Dropout(dropout) + # create constant 'pe' matrix with values dependant on + # pos and i + pe = torch.zeros(max_seq_len, d_model) + for pos in range(max_seq_len): + for i in range(0, d_model, 2): + pe[pos, i] = \ + math.sin(pos / (10000 ** ((2 * i)/d_model))) + pe[pos, i + 1] = \ + 
math.cos(pos / (10000 ** ((2 * (i + 1))/d_model))) + pe = pe.unsqueeze(0) + self.register_buffer('pe', pe) + + + def forward(self, x): + # make embeddings relatively larger + x = x * math.sqrt(self.d_model) + #add constant to embedding + seq_len = x.size(1) + pe = Variable(self.pe[:,:seq_len], requires_grad=False) + if x.is_cuda: + pe.cuda() + x = x + pe + return self.dropout(x) + +def get_clones(module, N): + return nn.ModuleList([copy.deepcopy(module) for i in range(N)]) + +class Encoder(nn.Module): + def __init__(self, vocab_size, d_model, N, heads, dropout): + super().__init__() + self.N = N + self.embed = Embedder(vocab_size, d_model) + self.pe = PositionalEncoder(d_model, dropout=dropout) + self.layers = get_clones(EncoderLayer(d_model, heads, dropout), N) + self.norm = Norm(d_model) + def forward(self, src, mask): + x = self.embed(src) + x = self.pe(x) + for i in range(self.N): + x = self.layers[i](x, mask) + return self.norm(x) + +class Decoder(nn.Module): + def __init__(self, vocab_size, d_model, N, heads, dropout): + super().__init__() + self.N = N + self.embed = Embedder(vocab_size, d_model) + self.pe = PositionalEncoder(d_model, dropout=dropout) + self.layers = get_clones(DecoderLayer(d_model, heads, dropout), N) + self.norm = Norm(d_model) + def forward(self, trg, e_outputs, src_mask, trg_mask): + x = self.embed(trg) + x = self.pe(x) + for i in range(self.N): + x = self.layers[i](x, e_outputs, src_mask, trg_mask) + return self.norm(x) + +class Model(nn.Module): + def __init__(self, config): + super().__init__() + self.encoder = Encoder(len(config["vocab_encoder"]), config[varables.DIM_ATTENTION], config[varables.NUM_LAYERS], config[varables.NUM_HEADS], config[varables.RATE_DROPOUT]) + self.decoder = Decoder(len(config["vocab_decoder"]), config[varables.DIM_ATTENTION], config[varables.NUM_LAYERS], config[varables.NUM_HEADS], config[varables.RATE_DROPOUT]) + self.out = nn.Linear(config[varables.DIM_ATTENTION], len(config["vocab_decoder"])) + # 
self.tok_emb = nn.Embedding(config[varables.SIZE_VOCAB], config[varables.DIM_EMBEDDING]) + # self.pos_emb = nn.Parameter(torch.zeros(1, config[varables.SIZE_BLOCK], config[varables.DIM_EMBEDDING])) + # self.drop = nn.Dropout(config[varables.RATE_DROPOUT]) + # self.encoder_blocks = nn.ModuleList([EncoderBlock(config) for _ in range(config[varables.NUM_LAYERS])]) + # self.decoder_blocks = nn.ModuleList([DecoderBlock(config) for _ in range(config[varables.NUM_LAYERS])]) + # self.blocks = nn.Sequential(*[DecoderBlock(config) for _ in range(config[varables.NUM_LAYERS])]) + # self.ln_f = nn.LayerNorm(config[varables.DIM_EMBEDDING]) + # self.head = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.SIZE_VOCAB], bias=False) + # self.block_size = config[varables.SIZE_BLOCK] + # self.apply(self._init_weights) + # logger.info("number of parameters: %e", sum(p.numel() for p in self.parameters())) + self.optimizer = None + + def get_block_size(self): + return self.block_size + + def _init_weights(self, module): + if isinstance(module, (nn.Linear, nn.Embedding)): + module.weight.data.normal_(mean=0.0, std=0.02) + if isinstance(module, nn.Linear) and module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.LayerNorm): + module.bias.data.zero_() + module.weight.data.fill_(1.0) + def init_optimizers(self,train_config): + optimizer = torch.optim.Adam(self.parameters(), lr=train_config[varables.RATE_LEARNING]) + return optimizer + def init_scheduler(self,train_config): + scheduler = torch.optim.lr_scheduler.StepLR(self.optimizer, step_size=train_config[varables.SIZE_STEP], gamma=train_config[varables.GAMMA]) + return scheduler + def get_collate_fn(self, vocab_encoder,vocab_decoder): + def collate(results): + x_in = [a[0] for a in results] + y_in = [a[1] for a in results] + boundary = -1 + max_len_x = max([len(a) for a in x_in]) + max_len_y = max([len(a) for a in y_in]) + x = torch.tensor([(a+[vocab_encoder[varables.TOKEN_PAD]]*(max_len_x-len(a))) for a 
in x_in],dtype=torch.long) + y = torch.tensor([(a+[vocab_decoder[varables.TOKEN_PAD]]*(max_len_y-len(a))) for a in y_in],dtype=torch.long) + return x,y,boundary + return collate + def forward(self, src, trg, trg_out, boundary=None): + src_mask = None + trg_mask = torch.tril(torch.ones(trg.shape[1], trg.shape[1])).view(1, 1, trg.shape[1], trg.shape[1]).to(trg.device) + e_outputs = self.encoder(src, src_mask) + d_output = self.decoder(trg, e_outputs, src_mask, trg_mask) + logits = self.out(d_output) + loss = None + if trg_out is not None: + loss = F.cross_entropy(logits.view(-1, logits.size(-1)), trg_out.view(-1)) + return logits, loss + +# mark test \ No newline at end of file diff --git a/SCMG/models/Transformer_debug5/model copy.py b/SCMG/models/Transformer_debug5/model copy.py new file mode 100644 index 0000000000000000000000000000000000000000..85ed98da342e63696371099158471e07cd1bf25c --- /dev/null +++ b/SCMG/models/Transformer_debug5/model copy.py @@ -0,0 +1,187 @@ +import math +import logging + +import torch +import torch.nn as nn +from torch.nn import functional as F + +logger = logging.getLogger(__name__) +from SCMG.config import varables + +# class ModelConfig(): +# rate_dropout_embedding = 0.1 +# rate_dropout_residue = 0.1 +# rate_dropout_attention = 0.1 +# block_size=125 +# def __init__(self, size_vocab, **kwargs): +# self.size_vocab = size_vocab +# for k,v in kwargs.items(): +# setattr(self, k, v) + +class CausalSelfAttention(nn.Module): + def __init__(self, config): + super().__init__() + assert config[varables.DIM_ATTENTION] % config[varables.NUM_HEADS] == 0 + self.key = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION]) + self.query = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION]) + self.value = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION]) + self.dropout_attention = nn.Dropout(config[varables.RATE_DROPOUT]) + self.dropout_residue = nn.Dropout(config[varables.RATE_DROPOUT]) + 
self.projection = nn.Linear(config[varables.DIM_ATTENTION], config[varables.DIM_EMBEDDING]) + self.register_buffer("mask", torch.tril(torch.ones(config[varables.SIZE_BLOCK], config[varables.SIZE_BLOCK])) + .view(1, 1, config[varables.SIZE_BLOCK], config[varables.SIZE_BLOCK])) + self.n_head = config[varables.NUM_HEADS] + self.single_head_dim = config[varables.DIM_ATTENTION] // self.n_head + self.attention_features = config[varables.DIM_ATTENTION] + + def forward(self, x, layer_past=None): + B, T, C = x.size() + k = self.key(x).view(B, T, self.n_head,self.single_head_dim).transpose(1, 2) + q = self.query(x).view(B, T, self.n_head,self.single_head_dim).transpose(1, 2) + v = self.value(x).view(B, T, self.n_head,self.single_head_dim).transpose(1, 2) + att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1))) + att = att.masked_fill(self.mask[:,:,:T,:T] == 0, float('-inf')) + att = F.softmax(att, dim=-1) + att = self.dropout_attention(att) + y = att @ v + y = y.transpose(1, 2).contiguous().view(B, T, self.attention_features) + y = self.dropout_residue(self.projection(y)) + return y + + +class CrossAttention(nn.Module): + def __init__(self, config): + super().__init__() + assert config[varables.DIM_ATTENTION] % config[varables.NUM_HEADS] == 0 + self.key = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION]) + self.query = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION]) + self.value = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION]) + self.dropout_attention = nn.Dropout(config[varables.RATE_DROPOUT]) + self.dropout_residue = nn.Dropout(config[varables.RATE_DROPOUT]) + self.projection = nn.Linear(config[varables.DIM_ATTENTION], config[varables.DIM_EMBEDDING]) + self.n_head = config[varables.NUM_HEADS] + self.single_head_dim = config[varables.DIM_ATTENTION] // self.n_head + self.attention_features = config[varables.DIM_ATTENTION] + self.register_buffer("mask", 
torch.tril(torch.ones(config[varables.SIZE_BLOCK], config[varables.SIZE_BLOCK])) + .view(1, 1, config[varables.SIZE_BLOCK], config[varables.SIZE_BLOCK])) + + def forward(self, x_encoder,x_decoder, layer_past=None): + B_encoder, T_encoder, C_encoder = x_encoder.size() + B_decoder, T_decoder, C_decoder = x_decoder.size() + k = self.key( x_encoder).view(B_encoder, T_encoder, self.n_head,self.single_head_dim).transpose(1, 2) + q = self.query(x_decoder).view(B_encoder, T_decoder, self.n_head,self.single_head_dim).transpose(1, 2) + v = self.value(x_encoder).view(B_encoder, T_encoder, self.n_head,self.single_head_dim).transpose(1, 2) + att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1))) + att = att.masked_fill(self.mask[:,:,:T_decoder,:T_encoder] == 0, float('-inf')) + att = F.softmax(att, dim=-1) + att = self.dropout_attention(att) + y = att @ v + y = y.transpose(1, 2).contiguous().view(B_encoder, T_decoder, self.attention_features) + y = self.dropout_residue(self.projection(y)) + return y + + + + +class EncoderBlock(nn.Module): + def __init__(self, config): + super().__init__() + self.ln1 = nn.LayerNorm(config[varables.DIM_EMBEDDING]) + self.ln2 = nn.LayerNorm(config[varables.DIM_EMBEDDING]) + self.attn = CausalSelfAttention(config) + self.mlp = nn.Sequential( + nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_FEEDFORWARD]), + nn.GELU(), + nn.Linear(config[varables.DIM_FEEDFORWARD], config[varables.DIM_EMBEDDING]), + nn.Dropout(config[varables.RATE_DROPOUT]), + ) + + def forward(self, x): + # = y_input + x = x + self.attn(self.ln1(x)) + x = x + self.mlp(self.ln2(x)) + return x + +class DecoderBlock(nn.Module): + def __init__(self, config): + super().__init__() + self.ln1 = nn.LayerNorm(config[varables.DIM_EMBEDDING]) + self.ln2 = nn.LayerNorm(config[varables.DIM_EMBEDDING]) + self.masked_attn = CausalSelfAttention(config) + self.cross_attn = CrossAttention(config) + self.mlp = nn.Sequential( + nn.Linear(config[varables.DIM_EMBEDDING], 
config[varables.DIM_FEEDFORWARD]), + nn.GELU(), + nn.Linear(config[varables.DIM_FEEDFORWARD], config[varables.DIM_EMBEDDING]), + nn.Dropout(config[varables.RATE_DROPOUT]), + ) + + def forward(self, x_encoder,x): + # = y_input + x = x + self.masked_attn(self.ln1(x)) + x = x + self.cross_attn(x_encoder,self.ln1(x)) + x = x + self.mlp(self.ln2(x)) + return x + +class Model(nn.Module): + def __init__(self, config): + super().__init__() + self.tok_emb = nn.Embedding(config[varables.SIZE_VOCAB], config[varables.DIM_EMBEDDING]) + self.pos_emb = nn.Parameter(torch.zeros(1, config[varables.SIZE_BLOCK], config[varables.DIM_EMBEDDING])) + self.drop = nn.Dropout(config[varables.RATE_DROPOUT]) + self.encoder_blocks = nn.ModuleList([EncoderBlock(config) for _ in range(config[varables.NUM_LAYERS])]) + self.decoder_blocks = nn.ModuleList([DecoderBlock(config) for _ in range(config[varables.NUM_LAYERS])]) + # self.blocks = nn.Sequential(*[DecoderBlock(config) for _ in range(config[varables.NUM_LAYERS])]) + self.ln_f = nn.LayerNorm(config[varables.DIM_EMBEDDING]) + self.head = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.SIZE_VOCAB], bias=False) + self.block_size = config[varables.SIZE_BLOCK] + self.apply(self._init_weights) + logger.info("number of parameters: %e", sum(p.numel() for p in self.parameters())) + self.optimizer = None + + def get_block_size(self): + return self.block_size + + def _init_weights(self, module): + if isinstance(module, (nn.Linear, nn.Embedding)): + module.weight.data.normal_(mean=0.0, std=0.02) + if isinstance(module, nn.Linear) and module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.LayerNorm): + module.bias.data.zero_() + module.weight.data.fill_(1.0) + def init_optimizers(self,train_config): + optimizer = torch.optim.Adam(self.parameters(), lr=train_config[varables.RATE_LEARNING]) + return optimizer + def init_scheduler(self,train_config): + scheduler = torch.optim.lr_scheduler.StepLR(self.optimizer, 
step_size=train_config[varables.SIZE_STEP], gamma=train_config[varables.GAMMA]) + return scheduler + def get_collate_fn(self, vocab): + def collate(results): + x_in = [a[0] for a in results] + y_in = [a[1] for a in results] + boundary = -1 + max_len_x = max([len(a) for a in x_in]) + max_len_y = max([len(a) for a in y_in]) + x = torch.tensor([(a+[vocab[varables.TOKEN_PAD]]*(max_len_x-len(a))) for a in x_in],dtype=torch.long) + y = torch.tensor([(a+[vocab[varables.TOKEN_PAD]]*(max_len_y-len(a))) for a in y_in],dtype=torch.long) + return x,y,boundary + return collate + + def forward(self, x_in, y_in, y_out=None,boundary=None): + x_in = self.drop(self.tok_emb(x_in) + self.pos_emb[:, :x_in.size()[1], :]) + y_in = self.drop(self.tok_emb(y_in) + self.pos_emb[:, :y_in.size()[1], :]) + # + for encoder_block in self.encoder_blocks: + x_in = encoder_block(x_in) + x_in = self.ln_f(x_in) + for decoder_block in self.decoder_blocks: + y_in = decoder_block(x_in,y_in) + y_in = self.ln_f(y_in) + logits = self.head(y_in) + loss = None + if y_out is not None: + loss = F.cross_entropy(logits.view(-1, logits.size(-1)), y_out.view(-1)) + return logits, loss + +# mark test \ No newline at end of file diff --git a/SCMG/models/Transformer_debug5/model.py b/SCMG/models/Transformer_debug5/model.py new file mode 100644 index 0000000000000000000000000000000000000000..8cba0822981298a6d9c005956bf3f2f09cffda00 --- /dev/null +++ b/SCMG/models/Transformer_debug5/model.py @@ -0,0 +1,249 @@ +import math +import logging + +import torch +import torch.nn as nn +from torch.nn import functional as F + +# logger = logging.getLogger(__name__) +from SCMG.config import varables +from torch.autograd import Variable + +class PositionalEncoder(nn.Module): + def __init__(self, config): + super(PositionalEncoder, self).__init__() + self.Dropout = nn.Dropout(p=config[varables.RATE_DROPOUT]) + max_len = config[varables.SIZE_BLOCK] + pe = torch.zeros(max_len, config[varables.DIM_EMBEDDING]) + position = 
torch.arange(0, max_len, dtype=torch.float).unsqueeze(1) + div_term = torch.exp(torch.arange(0, config[varables.DIM_EMBEDDING], 2).float() * (-math.log(10000.0) / config[varables.DIM_EMBEDDING])) + pe[:, 0::2] = torch.sin(position * div_term) + pe[:, 1::2] = torch.cos(position * div_term) + pe = pe.unsqueeze(0) + self.register_buffer('pe', pe) + def forward(self, T): + x = self.Dropout(self.pe[:,:T, :]) + return x + + +class Attention(nn.Module): + def __init__(self, config): + super().__init__() + assert config[varables.DIM_ATTENTION] % config[varables.NUM_HEADS] == 0 + self.Key = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION]) + self.Query = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION]) + self.Value = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION]) + self.Dropout_Attention = nn.Dropout(config[varables.RATE_DROPOUT]) + self.Dropout_Residue = nn.Dropout(config[varables.RATE_DROPOUT]) + self.Projection = nn.Linear(config[varables.DIM_ATTENTION], config[varables.DIM_EMBEDDING]) + self.NumberOfHeads = config[varables.NUM_HEADS] + self.DimHead = config[varables.DIM_ATTENTION] // self.NumberOfHeads + self.DimAttention = config[varables.DIM_ATTENTION] + + def forward(self, X_1,X_2, mask=None): + if X_2 is None: + X_2 = X_1 + BatchSize, T_Encoder, _ = X_1.size() + BatchSize, T_Decoder, _ = X_2.size() + K = self.Key( X_1).view(BatchSize, T_Encoder, self.NumberOfHeads,self.DimHead).transpose(1, 2) + Q = self.Query(X_2).view(BatchSize, T_Decoder, self.NumberOfHeads,self.DimHead).transpose(1, 2) + V = self.Value(X_1).view(BatchSize, T_Encoder, self.NumberOfHeads,self.DimHead).transpose(1, 2) + # k,q,v dimension: (BatchSize, SequenceSize, NumberOfHeads, HeadDimension) 3,4,5,16 + ScoreAttention = (Q @ K.transpose(-2, -1)) / math.sqrt(self.DimHead) + ScoreAttention = ScoreAttention.masked_fill(mask==0, -1e9) + ScoreAttention = F.softmax(ScoreAttention, dim=-1) + ScoreAttention = 
self.Dropout_Attention(ScoreAttention) + # k.transpose(-2,-1): 3,4,16,5 + # (q@(k.transpose(-2,-1))): 3,4,5,5 + Z = ScoreAttention @ V + # y dimension: 3,4,5,16 + Z = Z.transpose(1, 2).contiguous().view(BatchSize, T_Decoder, self.DimAttention) + # y dimension: 3,5,64 + Z = self.Dropout_Residue(self.Projection(Z)) + return Z + + + + + + + + + + +class FeedForward(nn.Module): + def __init__(self, config): + super().__init__() + if config[varables.DIM_FEEDFORWARD] == 0: + Dim_FeedForward = config[varables.DIM_ATTENTION] *4 + else: + Dim_FeedForward = config[varables.DIM_FEEDFORWARD] + self.Linear1 = nn.Linear(config[varables.DIM_EMBEDDING], Dim_FeedForward) + self.GELU = nn.GELU() + self.Linear2 = nn.Linear(Dim_FeedForward, config[varables.DIM_EMBEDDING]) + self.Dropout = nn.Dropout(config[varables.RATE_DROPOUT]) + + def forward(self,x): + x = self.Linear1(x) + x = self.GELU (x) + x = self.Dropout(x) + x = self.Linear2(x) + return x + + + + +class EncoderBlock(nn.Module): + def __init__(self, config): + super().__init__() + self.LayerNorm1 = nn.LayerNorm(config[varables.DIM_EMBEDDING]) + self.LayerNorm2 = nn.LayerNorm(config[varables.DIM_EMBEDDING]) + self.Dropout1 = nn.Dropout(config[varables.RATE_DROPOUT]) + self.Dropout2 = nn.Dropout(config[varables.RATE_DROPOUT]) + self.Attention = Attention( config) + self.FeedForward = FeedForward(config) + + def forward(self, X_Encoder,Mask_Encoder): + X_Encoder = self.LayerNorm1(X_Encoder + self.Attention (self.Dropout1(X_Encoder), None, Mask_Encoder)) + X_Encoder = self.LayerNorm2(X_Encoder + self.FeedForward(self.Dropout2(X_Encoder))) + return X_Encoder + +class DecoderBlock(nn.Module): + def __init__(self, config): + super().__init__() + self.LayerNorm1 = nn.LayerNorm(config[varables.DIM_EMBEDDING]) + self.LayerNorm2 = nn.LayerNorm(config[varables.DIM_EMBEDDING]) + self.LayerNorm3 = nn.LayerNorm(config[varables.DIM_EMBEDDING]) + self.Dropout1 = nn.Dropout(config[varables.RATE_DROPOUT]) + self.Dropout2 = 
nn.Dropout(config[varables.RATE_DROPOUT]) + self.Dropout3 = nn.Dropout(config[varables.RATE_DROPOUT]) + self.AttentionMasked = Attention( config) + self.AttentionCross = Attention( config) + self.FeedForward = FeedForward(config) + + def forward(self, X_Encoder,X_Decoder,Mask_Cross,Mask_Decoder): + X_Decoder = self.LayerNorm1(X_Decoder + self.AttentionMasked(self.Dropout1(X_Decoder), None, Mask_Decoder)) + X_Decoder = self.LayerNorm2(X_Decoder + self.AttentionCross ( X_Encoder, self.Dropout2(X_Decoder), Mask_Cross )) + X_Decoder = self.LayerNorm3(X_Decoder + self.FeedForward (self.Dropout3(X_Decoder) )) + return X_Decoder + + + + + + + + + + + + + + + + + + +class Model(nn.Module): + def __init__(self, config): + super().__init__() + # Varables + self.Dim_Embedding = config[varables.DIM_EMBEDDING] + self.Token_Padding_Encoder = config["Token_Padding_Encoder"] + self.Token_Padding_Decoder = config["Token_Padding_Decoder"] + # Embedding and positional encoding layers + self.Embedding_Encoder = nn.Embedding(len(config["vocab_encoder"]), config[varables.DIM_EMBEDDING]) + self.Embedding_Decoder = nn.Embedding(len(config["vocab_decoder"]), config[varables.DIM_EMBEDDING]) + self.pos_emb = PositionalEncoder(config) + # Dropout and normalization layers + self.Dropout1 = nn.Dropout(config[varables.RATE_DROPOUT]) + self.Dropout2 = nn.Dropout(config[varables.RATE_DROPOUT]) + self.LayerNorm1 = nn.LayerNorm(config[varables.DIM_EMBEDDING]) + self.LayerNorm2 = nn.LayerNorm(config[varables.DIM_EMBEDDING]) + # Transformer layers + self.encoder_blocks = nn.ModuleList([EncoderBlock(config) for _ in range(config[varables.NUM_LAYERS])]) + self.decoder_blocks = nn.ModuleList([DecoderBlock(config) for _ in range(config[varables.NUM_LAYERS])]) + # Output layer + self.head = nn.Linear(config[varables.DIM_EMBEDDING], len(config["vocab_decoder"]), bias=False) + # Init + self.apply(self._init_weights) + self.optimizer = None + # logger.info("number of parameters: %e", sum(p.numel() for p in 
# --- Methods of the Transformer encoder-decoder Model class ---------------
# NOTE(review): the enclosing `class Model(nn.Module)` header and __init__
# live above this chunk; these definitions belong at method indentation
# inside that class.

def _init_weights(self, module):
    """Xavier-initialise every parameter tensor with more than one dimension.

    Applied via ``self.apply`` so it visits each sub-module once; 1-D
    parameters (biases, LayerNorm scales) keep their default init.
    """
    for p in module.parameters():
        if p.dim() > 1:
            nn.init.xavier_uniform_(p)

def init_optimizers(self, train_config):
    """Build (and store) an Adam optimizer from ``train_config``.

    Fix: the original returned the optimizer without assigning
    ``self.optimizer``, yet ``init_scheduler`` reads ``self.optimizer``
    (initialised to ``None`` in ``__init__``) — calling ``init_scheduler``
    without a manual assignment crashed.  Storing it here keeps the return
    value unchanged and makes the pair usable stand-alone.
    """
    optimizer = torch.optim.Adam(self.parameters(), lr=train_config[varables.RATE_LEARNING])
    self.optimizer = optimizer
    return optimizer

def init_scheduler(self, train_config):
    """Step-decay LR scheduler over the stored optimizer (see init_optimizers)."""
    scheduler = torch.optim.lr_scheduler.StepLR(
        self.optimizer,
        step_size=train_config[varables.SIZE_STEP],
        gamma=train_config[varables.GAMMA],
    )
    return scheduler

def get_collate_fn(self, vocab_encoder, vocab_decoder):
    """Return a DataLoader ``collate_fn`` that right-pads encoder/decoder
    token-id sequences to the batch maximum with each vocab's PAD id.

    The returned tuple is ``(x, y, boundary)``; ``boundary`` is a constant
    ``-1`` placeholder kept for interface parity with other model variants.
    """
    def collate(results):
        X_Encoder = [a[0] for a in results]
        X_Decoder = [a[1] for a in results]
        boundary = -1
        max_len_x = max(len(a) for a in X_Encoder)
        max_len_y = max(len(a) for a in X_Decoder)
        x = torch.tensor(
            [a + [vocab_encoder[varables.TOKEN_PAD]] * (max_len_x - len(a)) for a in X_Encoder],
            dtype=torch.long)
        y = torch.tensor(
            [a + [vocab_decoder[varables.TOKEN_PAD]] * (max_len_y - len(a)) for a in X_Decoder],
            dtype=torch.long)
        return x, y, boundary
    return collate

def generate_masks(self, X_Encoder, X_Decoder):
    """Build broadcastable attention masks from padding positions.

    Returns (Mask_Encoder, Mask_Cross) of shape (B, 1, 1, T_enc) and
    Mask_Decoder of shape (B, 1, T_dec, T_dec) — padding mask combined
    with a lower-triangular causal mask.
    """
    T = X_Decoder.shape[1]
    Mask_Encoder = (X_Encoder != self.Token_Padding_Encoder).unsqueeze(-2).unsqueeze(-2)
    Mask_Decoder = (X_Decoder != self.Token_Padding_Decoder).unsqueeze(-2).unsqueeze(-2).repeat(1, 1, T, 1)
    Mask_Cross = (X_Encoder != self.Token_Padding_Encoder).unsqueeze(-2).unsqueeze(-2)
    mask_tril = torch.tril(torch.ones(T, T)).view(1, 1, T, T).to(Mask_Decoder.device)
    Mask_Decoder = Mask_Decoder.masked_fill(mask_tril == 0, 0)
    return Mask_Encoder, Mask_Decoder, Mask_Cross

def forward(self, X_Encoder, X_Decoder, Y_Decoder_Ref=None, boundary=None):
    """Full encoder-decoder pass over token-id batches.

    Parameters: X_Encoder/X_Decoder are (B, T) long tensors; Y_Decoder_Ref,
    when given, is the shifted target used for the loss; ``boundary`` is
    accepted for interface parity and unused here.
    Returns ``(logits, loss)`` where loss is None without a reference.
    Padding targets are excluded from the loss via ``ignore_index``.
    """
    Mask_Encoder, Mask_Decoder, Mask_Cross = self.generate_masks(X_Encoder, X_Decoder)
    # Scaled embeddings + sinusoidal positions, then embedding dropout.
    X_Encoder = self.Dropout1(
        self.Embedding_Encoder(X_Encoder) * math.sqrt(self.Dim_Embedding)
        + self.pos_emb(X_Encoder.size(1)))
    X_Decoder = self.Dropout2(
        self.Embedding_Decoder(X_Decoder) * math.sqrt(self.Dim_Embedding)
        + self.pos_emb(X_Decoder.size(1)))
    for encoder_block in self.encoder_blocks:
        X_Encoder = encoder_block(X_Encoder, Mask_Encoder)
    for decoder_block in self.decoder_blocks:
        X_Decoder = decoder_block(X_Encoder, X_Decoder, Mask_Cross, Mask_Decoder)
    Y_Decoder_Logits = self.head(X_Decoder)
    loss = None
    if Y_Decoder_Ref is not None:
        # nll_loss over explicit log_softmax (equivalent to cross_entropy)
        # with padding positions ignored.
        loss = F.nll_loss(
            F.log_softmax(Y_Decoder_Logits, dim=-1).view(-1, Y_Decoder_Logits.size(-1)),
            Y_Decoder_Ref.view(-1),
            ignore_index=self.Token_Padding_Decoder)
    return Y_Decoder_Logits, loss
"""Autoregressive sampling utilities for the encoder-decoder model.

Fix: the original module defined ``sample`` twice; the second definition
(multinomial sampling with a ``boundary`` kwarg) silently shadowed the first
(temperature + optional top-k / greedy decoding).  The two are merged below
into one backward-compatible function.
"""
import random

import numpy as np
import torch
from torch.nn import functional as F


def set_seed(seed):
    """Seed python, numpy and torch (incl. CUDA) RNGs for reproducibility."""
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)


def top_k_logits(logits, k):
    """Return a copy of ``logits`` with everything below the k-th largest
    value per row set to -inf (so softmax gives them zero probability)."""
    v, ix = torch.topk(logits, k)
    out = logits.clone()
    out[out < v[:, [-1]]] = -float('Inf')
    return out


@torch.no_grad()
def sample(model, x, steps, temperature=1.0, sample=True, top_k=None, boundary=None):
    """Extend token batch ``x`` by ``steps`` tokens, one step at a time.

    Defaults reproduce the previously-live (second) definition: multinomial
    sampling with ``boundary`` forwarded to the model.  ``sample=False``
    restores greedy argmax decoding and ``top_k`` restores top-k filtering
    from the definition that had been shadowed.
    The context is truncated to the model's block size each step.
    """
    block_size = model.get_block_size()
    model.eval()
    for _ in range(steps):
        x_cond = x if x.size(1) <= block_size else x[:, -block_size:]
        logits, _ = model(x_cond, boundary=boundary)
        logits = logits[:, -1, :] / temperature
        if top_k is not None:
            logits = top_k_logits(logits, top_k)
        probs = F.softmax(logits, dim=-1)
        if sample:
            ix = torch.multinomial(probs, num_samples=1)
        else:
            _, ix = torch.topk(probs, k=1, dim=-1)
        x = torch.cat((x, ix), dim=1)
    return x


# Example scaffold input left in the original module (debug residue):
# 'L_5*C(=O)NCc1cccc(OC)c1.*c1nsc2ccccc12COc1cccc(CNC(=O)c2cccc(NC(=O)c3nsc4ccccc34)c2)c1'

def sample_L(i, option='string'):
    """Debug helper: sample 32 completions for linker index ``i`` and print
    each generated string plus a 1/0 validity flag.

    NOTE(review): relies on module-level globals ``vocab``, ``inv``,
    ``model`` and ``test_valid`` that are NOT defined in this file (leftover
    from an interactive session) — calling it as-is raises NameError.
    Special-token literals appear stripped to '' by text extraction
    (likely '<bos>'/'<eos>'-style tokens); verify against the vocab.
    """
    prefix = 'L_' + str(i)
    string_input = prefix + '*O=C1NN=Cc2c1cccc2.*O=C(C1CC1)N1CCNCC1'
    array_input = [vocab[a] for a in [''] + list(string_input)]
    boundary = [len(array_input)]
    tensor_input = torch.tensor(array_input, device='cuda').unsqueeze(0).repeat(32, 1)
    boundary = boundary * 32
    tensor_output = sample(model, tensor_input, 250, boundary=boundary)
    strings_output = []
    for j in range(tensor_output.shape[0]):
        list_string_output = [
            inv[a] for a in tensor_output[j, boundary[j]:].cpu().numpy()
            if a != vocab['']
        ]
        if list_string_output[-1] == '':
            list_string_output = list_string_output[:-1]
        string_output = ''.join(list_string_output)
        strings_output.append(string_output)
        print(string_output)
    for j in range(tensor_output.shape[0]):
        print(1 if test_valid(strings_output[j]) else 0)
"""Backup file ('model copy 2.py'): encoder-decoder transformer built from a
pre-norm layer stack (Norm / MultiHeadAttention / FeedForward).

The first four classes (CausalSelfAttention, CrossAttention, EncoderBlock,
DecoderBlock) are an older, unused stack kept for reference; ``Model`` uses
the Encoder/Decoder stack below them.  Behavior preserved from the original;
review notes are marked NOTE(review).
"""
import copy
import logging
import math

import torch
import torch.nn as nn
from torch.autograd import Variable
from torch.nn import functional as F

from SCMG.config import varables

logger = logging.getLogger(__name__)


class CausalSelfAttention(nn.Module):
    """Masked multi-head self-attention (legacy stack, unused by Model)."""

    def __init__(self, config):
        super().__init__()
        assert config[varables.DIM_ATTENTION] % config[varables.NUM_HEADS] == 0
        dim_in = config[varables.DIM_EMBEDDING]
        dim_att = config[varables.DIM_ATTENTION]
        self.key = nn.Linear(dim_in, dim_att)
        self.query = nn.Linear(dim_in, dim_att)
        self.value = nn.Linear(dim_in, dim_att)
        self.dropout_attention = nn.Dropout(config[varables.RATE_DROPOUT])
        self.dropout_residue = nn.Dropout(config[varables.RATE_DROPOUT])
        self.projection = nn.Linear(dim_att, dim_in)
        # Pre-built lower-triangular causal mask up to SIZE_BLOCK positions.
        block = config[varables.SIZE_BLOCK]
        self.register_buffer(
            "mask", torch.tril(torch.ones(block, block)).view(1, 1, block, block))
        self.n_head = config[varables.NUM_HEADS]
        self.single_head_dim = dim_att // self.n_head
        self.attention_features = dim_att

    def forward(self, x, layer_past=None):
        B, T, _ = x.size()
        k = self.key(x).view(B, T, self.n_head, self.single_head_dim).transpose(1, 2)
        q = self.query(x).view(B, T, self.n_head, self.single_head_dim).transpose(1, 2)
        v = self.value(x).view(B, T, self.n_head, self.single_head_dim).transpose(1, 2)
        att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1)))
        att = att.masked_fill(self.mask[:, :, :T, :T] == 0, float('-inf'))
        att = self.dropout_attention(F.softmax(att, dim=-1))
        y = (att @ v).transpose(1, 2).contiguous().view(B, T, self.attention_features)
        return self.dropout_residue(self.projection(y))


class CrossAttention(nn.Module):
    """Decoder-queries / encoder-keys attention (legacy stack, unused).

    NOTE(review): applies the causal triangle ``mask[:T_dec, :T_enc]`` to
    CROSS attention, restricting decoder position t to encoder positions
    <= t — unusual for an encoder-decoder; confirm intent before reuse.
    """

    def __init__(self, config):
        super().__init__()
        assert config[varables.DIM_ATTENTION] % config[varables.NUM_HEADS] == 0
        dim_in = config[varables.DIM_EMBEDDING]
        dim_att = config[varables.DIM_ATTENTION]
        self.key = nn.Linear(dim_in, dim_att)
        self.query = nn.Linear(dim_in, dim_att)
        self.value = nn.Linear(dim_in, dim_att)
        self.dropout_attention = nn.Dropout(config[varables.RATE_DROPOUT])
        self.dropout_residue = nn.Dropout(config[varables.RATE_DROPOUT])
        self.projection = nn.Linear(dim_att, dim_in)
        self.n_head = config[varables.NUM_HEADS]
        self.single_head_dim = dim_att // self.n_head
        self.attention_features = dim_att
        block = config[varables.SIZE_BLOCK]
        self.register_buffer(
            "mask", torch.tril(torch.ones(block, block)).view(1, 1, block, block))

    def forward(self, x_encoder, x_decoder, layer_past=None):
        B, T_enc, _ = x_encoder.size()
        _, T_dec, _ = x_decoder.size()
        k = self.key(x_encoder).view(B, T_enc, self.n_head, self.single_head_dim).transpose(1, 2)
        q = self.query(x_decoder).view(B, T_dec, self.n_head, self.single_head_dim).transpose(1, 2)
        v = self.value(x_encoder).view(B, T_enc, self.n_head, self.single_head_dim).transpose(1, 2)
        att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1)))
        att = att.masked_fill(self.mask[:, :, :T_dec, :T_enc] == 0, float('-inf'))
        att = self.dropout_attention(F.softmax(att, dim=-1))
        y = (att @ v).transpose(1, 2).contiguous().view(B, T_dec, self.attention_features)
        return self.dropout_residue(self.projection(y))


class EncoderBlock(nn.Module):
    """Pre-norm encoder block: self-attention + GELU MLP (legacy, unused)."""

    def __init__(self, config):
        super().__init__()
        dim = config[varables.DIM_EMBEDDING]
        self.ln1 = nn.LayerNorm(dim)
        self.ln2 = nn.LayerNorm(dim)
        self.attn = CausalSelfAttention(config)
        self.mlp = nn.Sequential(
            nn.Linear(dim, config[varables.DIM_FEEDFORWARD]),
            nn.GELU(),
            nn.Linear(config[varables.DIM_FEEDFORWARD], dim),
            nn.Dropout(config[varables.RATE_DROPOUT]),
        )

    def forward(self, x):
        x = x + self.attn(self.ln1(x))
        x = x + self.mlp(self.ln2(x))
        return x


class DecoderBlock(nn.Module):
    """Pre-norm decoder block (legacy, unused).

    NOTE(review): ``ln1`` is reused for both the self- and cross-attention
    pre-norms (no separate third LayerNorm) — probably unintended, kept as-is.
    """

    def __init__(self, config):
        super().__init__()
        dim = config[varables.DIM_EMBEDDING]
        self.ln1 = nn.LayerNorm(dim)
        self.ln2 = nn.LayerNorm(dim)
        self.masked_attn = CausalSelfAttention(config)
        self.cross_attn = CrossAttention(config)
        self.mlp = nn.Sequential(
            nn.Linear(dim, config[varables.DIM_FEEDFORWARD]),
            nn.GELU(),
            nn.Linear(config[varables.DIM_FEEDFORWARD], dim),
            nn.Dropout(config[varables.RATE_DROPOUT]),
        )

    def forward(self, x_encoder, x):
        x = x + self.masked_attn(self.ln1(x))
        x = x + self.cross_attn(x_encoder, self.ln1(x))
        x = x + self.mlp(self.ln2(x))
        return x


class Norm(nn.Module):
    """Layer normalisation with learnable scale (alpha) and shift (bias)."""

    def __init__(self, d_model, eps=1e-6):
        super().__init__()
        self.size = d_model
        self.alpha = nn.Parameter(torch.ones(self.size))
        self.bias = nn.Parameter(torch.zeros(self.size))
        self.eps = eps

    def forward(self, x):
        centered = x - x.mean(dim=-1, keepdim=True)
        return self.alpha * centered / (x.std(dim=-1, keepdim=True) + self.eps) + self.bias


def attention(q, k, v, d_k, mask=None, dropout=None):
    """Scaled dot-product attention; ``mask==0`` positions get -1e9."""
    scores = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(d_k)
    if mask is not None:
        mask = mask.unsqueeze(1)  # broadcast over heads
        scores = scores.masked_fill(mask == 0, -1e9)
    scores = F.softmax(scores, dim=-1)
    if dropout is not None:
        scores = dropout(scores)
    return torch.matmul(scores, v)


class MultiHeadAttention(nn.Module):
    """Multi-head attention with separate q/k/v projections and output linear."""

    def __init__(self, heads, d_model, dropout=0.1):
        super().__init__()
        self.d_model = d_model
        self.d_k = d_model // heads
        self.h = heads
        self.q_linear = nn.Linear(d_model, d_model)
        self.v_linear = nn.Linear(d_model, d_model)
        self.k_linear = nn.Linear(d_model, d_model)
        self.dropout = nn.Dropout(dropout)
        self.out = nn.Linear(d_model, d_model)

    def forward(self, q, k, v, mask=None):
        bs = q.size(0)
        # Project, split into heads, move head axis before sequence axis.
        k = self.k_linear(k).view(bs, -1, self.h, self.d_k).transpose(1, 2)
        q = self.q_linear(q).view(bs, -1, self.h, self.d_k).transpose(1, 2)
        v = self.v_linear(v).view(bs, -1, self.h, self.d_k).transpose(1, 2)
        scores = attention(q, k, v, self.d_k, mask, self.dropout)
        concat = scores.transpose(1, 2).contiguous().view(bs, -1, self.d_model)
        return self.out(concat)


class FeedForward(nn.Module):
    """Position-wise FFN: linear -> ReLU -> dropout -> linear."""

    def __init__(self, d_model, d_ff=2048, dropout=0.1):
        super().__init__()
        self.linear_1 = nn.Linear(d_model, d_ff)
        self.dropout = nn.Dropout(dropout)
        self.linear_2 = nn.Linear(d_ff, d_model)

    def forward(self, x):
        return self.linear_2(self.dropout(F.relu(self.linear_1(x))))


class EncoderLayer(nn.Module):
    """Pre-norm encoder layer: self-attention then feed-forward."""

    def __init__(self, d_model, heads, dropout=0.1):
        super().__init__()
        self.norm_1 = Norm(d_model)
        self.norm_2 = Norm(d_model)
        self.attn = MultiHeadAttention(heads, d_model, dropout=dropout)
        self.ff = FeedForward(d_model, dropout=dropout)
        self.dropout_1 = nn.Dropout(dropout)
        self.dropout_2 = nn.Dropout(dropout)

    def forward(self, x, mask):
        x2 = self.norm_1(x)
        x = x + self.dropout_1(self.attn(x2, x2, x2, mask))
        x2 = self.norm_2(x)
        x = x + self.dropout_2(self.ff(x2))
        return x


class DecoderLayer(nn.Module):
    """Pre-norm decoder layer: masked self-attention, cross-attention, FFN."""

    def __init__(self, d_model, heads, dropout=0.1):
        super().__init__()
        self.norm_1 = Norm(d_model)
        self.norm_2 = Norm(d_model)
        self.norm_3 = Norm(d_model)
        self.dropout_1 = nn.Dropout(dropout)
        self.dropout_2 = nn.Dropout(dropout)
        self.dropout_3 = nn.Dropout(dropout)
        self.attn_1 = MultiHeadAttention(heads, d_model, dropout=dropout)
        self.attn_2 = MultiHeadAttention(heads, d_model, dropout=dropout)
        self.ff = FeedForward(d_model, dropout=dropout)

    def forward(self, x, e_outputs, src_mask, trg_mask):
        x2 = self.norm_1(x)
        x = x + self.dropout_1(self.attn_1(x2, x2, x2, trg_mask))
        x2 = self.norm_2(x)
        x = x + self.dropout_2(self.attn_2(x2, e_outputs, e_outputs, src_mask))
        x2 = self.norm_3(x)
        x = x + self.dropout_3(self.ff(x2))
        return x


class Embedder(nn.Module):
    """Thin wrapper around nn.Embedding."""

    def __init__(self, vocab_size, d_model):
        super().__init__()
        self.d_model = d_model
        self.embed = nn.Embedding(vocab_size, d_model)

    def forward(self, x):
        return self.embed(x)


class PositionalEncoder(nn.Module):
    """Sinusoidal positional encoding added to scaled embeddings.

    NOTE(review): the loop assumes ``d_model`` is even (it writes
    ``pe[pos, i + 1]``), and the exponent uses ``2*i`` where the canonical
    formula uses ``i`` — kept byte-for-byte-equivalent to the original.
    """

    def __init__(self, d_model, max_seq_len=200, dropout=0.1):
        super().__init__()
        self.d_model = d_model
        self.dropout = nn.Dropout(dropout)
        pe = torch.zeros(max_seq_len, d_model)
        for pos in range(max_seq_len):
            for i in range(0, d_model, 2):
                pe[pos, i] = math.sin(pos / (10000 ** ((2 * i) / d_model)))
                pe[pos, i + 1] = math.cos(pos / (10000 ** ((2 * (i + 1)) / d_model)))
        self.register_buffer('pe', pe.unsqueeze(0))

    def forward(self, x):
        x = x * math.sqrt(self.d_model)  # scale embeddings up first
        seq_len = x.size(1)
        pe = Variable(self.pe[:, :seq_len], requires_grad=False)
        if x.is_cuda:
            # NOTE(review): `.cuda()` returns a new tensor that is discarded
            # here; the buffer is already moved with the module, so this line
            # is a no-op — kept for behavioral parity.
            pe.cuda()
        x = x + pe
        return self.dropout(x)


def get_clones(module, N):
    """N independent deep copies of ``module`` as a ModuleList."""
    return nn.ModuleList([copy.deepcopy(module) for _ in range(N)])


class Encoder(nn.Module):
    """Embedding + positional encoding + N EncoderLayers + final Norm."""

    def __init__(self, vocab_size, d_model, N, heads, dropout):
        super().__init__()
        self.N = N
        self.embed = Embedder(vocab_size, d_model)
        self.pe = PositionalEncoder(d_model, dropout=dropout)
        self.layers = get_clones(EncoderLayer(d_model, heads, dropout), N)
        self.norm = Norm(d_model)

    def forward(self, src, mask):
        x = self.pe(self.embed(src))
        for layer in self.layers:
            x = layer(x, mask)
        return self.norm(x)


class Decoder(nn.Module):
    """Embedding + positional encoding + N DecoderLayers + final Norm."""

    def __init__(self, vocab_size, d_model, N, heads, dropout):
        super().__init__()
        self.N = N
        self.embed = Embedder(vocab_size, d_model)
        self.pe = PositionalEncoder(d_model, dropout=dropout)
        self.layers = get_clones(DecoderLayer(d_model, heads, dropout), N)
        self.norm = Norm(d_model)

    def forward(self, trg, e_outputs, src_mask, trg_mask):
        x = self.pe(self.embed(trg))
        for layer in self.layers:
            x = layer(x, e_outputs, src_mask, trg_mask)
        return self.norm(x)


class Model(nn.Module):
    """Encoder-decoder model over the Encoder/Decoder stack above."""

    def __init__(self, config):
        super().__init__()
        self.encoder = Encoder(len(config["vocab_encoder"]), config[varables.DIM_ATTENTION],
                               config[varables.NUM_LAYERS], config[varables.NUM_HEADS],
                               config[varables.RATE_DROPOUT])
        self.decoder = Decoder(len(config["vocab_decoder"]), config[varables.DIM_ATTENTION],
                               config[varables.NUM_LAYERS], config[varables.NUM_HEADS],
                               config[varables.RATE_DROPOUT])
        self.out = nn.Linear(config[varables.DIM_ATTENTION], len(config["vocab_decoder"]))
        self.optimizer = None

    def get_block_size(self):
        # NOTE(review): ``self.block_size`` is never assigned in this backup
        # file (the assignment is commented out), so this raises
        # AttributeError if called — confirm before relying on it.
        return self.block_size

    def _init_weights(self, module):
        """Normal(0, 0.02) init for Linear/Embedding; unit LayerNorm."""
        if isinstance(module, (nn.Linear, nn.Embedding)):
            module.weight.data.normal_(mean=0.0, std=0.02)
            if isinstance(module, nn.Linear) and module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

    def init_optimizers(self, train_config):
        """Adam optimizer from the training config (returned, not stored)."""
        return torch.optim.Adam(self.parameters(), lr=train_config[varables.RATE_LEARNING])

    def init_scheduler(self, train_config):
        """StepLR over ``self.optimizer`` (caller must have assigned it)."""
        return torch.optim.lr_scheduler.StepLR(
            self.optimizer,
            step_size=train_config[varables.SIZE_STEP],
            gamma=train_config[varables.GAMMA])

    def get_collate_fn(self, vocab_encoder, vocab_decoder):
        """collate_fn right-padding encoder/decoder sequences with PAD ids."""
        def collate(results):
            x_in = [a[0] for a in results]
            y_in = [a[1] for a in results]
            boundary = -1  # placeholder kept for interface parity
            max_len_x = max(len(a) for a in x_in)
            max_len_y = max(len(a) for a in y_in)
            x = torch.tensor(
                [a + [vocab_encoder[varables.TOKEN_PAD]] * (max_len_x - len(a)) for a in x_in],
                dtype=torch.long)
            y = torch.tensor(
                [a + [vocab_decoder[varables.TOKEN_PAD]] * (max_len_y - len(a)) for a in y_in],
                dtype=torch.long)
            return x, y, boundary
        return collate

    def forward(self, src, trg, trg_out, boundary=None):
        """Return (logits, loss); loss is None when ``trg_out`` is None.

        NOTE(review): the loss has no ignore_index, so padded target
        positions contribute to it — the newer model.py excludes them.
        """
        src_mask = None
        T = trg.shape[1]
        trg_mask = torch.tril(torch.ones(T, T)).view(1, 1, T, T).to(trg.device)
        e_outputs = self.encoder(src, src_mask)
        d_output = self.decoder(trg, e_outputs, src_mask, trg_mask)
        logits = self.out(d_output)
        loss = None
        if trg_out is not None:
            loss = F.cross_entropy(logits.view(-1, logits.size(-1)), trg_out.view(-1))
        return logits, loss
"""Backup file ('model copy.py'): encoder-decoder transformer with a SHARED
token embedding and a learned positional embedding Parameter.

Behavior preserved from the original; review notes marked NOTE(review).
"""
import logging
import math

import torch
import torch.nn as nn
from torch.nn import functional as F

from SCMG.config import varables

logger = logging.getLogger(__name__)


class CausalSelfAttention(nn.Module):
    """Masked multi-head self-attention with a pre-built triangular mask."""

    def __init__(self, config):
        super().__init__()
        assert config[varables.DIM_ATTENTION] % config[varables.NUM_HEADS] == 0
        dim_in = config[varables.DIM_EMBEDDING]
        dim_att = config[varables.DIM_ATTENTION]
        self.key = nn.Linear(dim_in, dim_att)
        self.query = nn.Linear(dim_in, dim_att)
        self.value = nn.Linear(dim_in, dim_att)
        self.dropout_attention = nn.Dropout(config[varables.RATE_DROPOUT])
        self.dropout_residue = nn.Dropout(config[varables.RATE_DROPOUT])
        self.projection = nn.Linear(dim_att, dim_in)
        block = config[varables.SIZE_BLOCK]
        self.register_buffer(
            "mask", torch.tril(torch.ones(block, block)).view(1, 1, block, block))
        self.n_head = config[varables.NUM_HEADS]
        self.single_head_dim = dim_att // self.n_head
        self.attention_features = dim_att

    def forward(self, x, layer_past=None):
        B, T, _ = x.size()
        k = self.key(x).view(B, T, self.n_head, self.single_head_dim).transpose(1, 2)
        q = self.query(x).view(B, T, self.n_head, self.single_head_dim).transpose(1, 2)
        v = self.value(x).view(B, T, self.n_head, self.single_head_dim).transpose(1, 2)
        att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1)))
        att = att.masked_fill(self.mask[:, :, :T, :T] == 0, float('-inf'))
        att = self.dropout_attention(F.softmax(att, dim=-1))
        y = (att @ v).transpose(1, 2).contiguous().view(B, T, self.attention_features)
        return self.dropout_residue(self.projection(y))


class CrossAttention(nn.Module):
    """Decoder-queries / encoder-keys attention.

    NOTE(review): applies the causal triangle ``mask[:T_dec, :T_enc]`` to
    CROSS attention (decoder position t only sees encoder positions <= t) —
    unusual; confirm intent before reuse.
    """

    def __init__(self, config):
        super().__init__()
        assert config[varables.DIM_ATTENTION] % config[varables.NUM_HEADS] == 0
        dim_in = config[varables.DIM_EMBEDDING]
        dim_att = config[varables.DIM_ATTENTION]
        self.key = nn.Linear(dim_in, dim_att)
        self.query = nn.Linear(dim_in, dim_att)
        self.value = nn.Linear(dim_in, dim_att)
        self.dropout_attention = nn.Dropout(config[varables.RATE_DROPOUT])
        self.dropout_residue = nn.Dropout(config[varables.RATE_DROPOUT])
        self.projection = nn.Linear(dim_att, dim_in)
        self.n_head = config[varables.NUM_HEADS]
        self.single_head_dim = dim_att // self.n_head
        self.attention_features = dim_att
        block = config[varables.SIZE_BLOCK]
        self.register_buffer(
            "mask", torch.tril(torch.ones(block, block)).view(1, 1, block, block))

    def forward(self, x_encoder, x_decoder, layer_past=None):
        B, T_enc, _ = x_encoder.size()
        _, T_dec, _ = x_decoder.size()
        k = self.key(x_encoder).view(B, T_enc, self.n_head, self.single_head_dim).transpose(1, 2)
        q = self.query(x_decoder).view(B, T_dec, self.n_head, self.single_head_dim).transpose(1, 2)
        v = self.value(x_encoder).view(B, T_enc, self.n_head, self.single_head_dim).transpose(1, 2)
        att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1)))
        att = att.masked_fill(self.mask[:, :, :T_dec, :T_enc] == 0, float('-inf'))
        att = self.dropout_attention(F.softmax(att, dim=-1))
        y = (att @ v).transpose(1, 2).contiguous().view(B, T_dec, self.attention_features)
        return self.dropout_residue(self.projection(y))


class EncoderBlock(nn.Module):
    """Pre-norm encoder block: self-attention + GELU MLP, both residual."""

    def __init__(self, config):
        super().__init__()
        dim = config[varables.DIM_EMBEDDING]
        self.ln1 = nn.LayerNorm(dim)
        self.ln2 = nn.LayerNorm(dim)
        self.attn = CausalSelfAttention(config)
        self.mlp = nn.Sequential(
            nn.Linear(dim, config[varables.DIM_FEEDFORWARD]),
            nn.GELU(),
            nn.Linear(config[varables.DIM_FEEDFORWARD], dim),
            nn.Dropout(config[varables.RATE_DROPOUT]),
        )

    def forward(self, x):
        x = x + self.attn(self.ln1(x))
        x = x + self.mlp(self.ln2(x))
        return x


class DecoderBlock(nn.Module):
    """Pre-norm decoder block.

    NOTE(review): ``ln1`` is reused for both the self- and cross-attention
    pre-norms — probably unintended, kept as-is for behavioral parity.
    """

    def __init__(self, config):
        super().__init__()
        dim = config[varables.DIM_EMBEDDING]
        self.ln1 = nn.LayerNorm(dim)
        self.ln2 = nn.LayerNorm(dim)
        self.masked_attn = CausalSelfAttention(config)
        self.cross_attn = CrossAttention(config)
        self.mlp = nn.Sequential(
            nn.Linear(dim, config[varables.DIM_FEEDFORWARD]),
            nn.GELU(),
            nn.Linear(config[varables.DIM_FEEDFORWARD], dim),
            nn.Dropout(config[varables.RATE_DROPOUT]),
        )

    def forward(self, x_encoder, x):
        x = x + self.masked_attn(self.ln1(x))
        x = x + self.cross_attn(x_encoder, self.ln1(x))
        x = x + self.mlp(self.ln2(x))
        return x


class Model(nn.Module):
    """Encoder-decoder with one shared token embedding for both sides."""

    def __init__(self, config):
        super().__init__()
        self.tok_emb = nn.Embedding(config[varables.SIZE_VOCAB], config[varables.DIM_EMBEDDING])
        # Learned positional embedding shared by encoder and decoder.
        self.pos_emb = nn.Parameter(
            torch.zeros(1, config[varables.SIZE_BLOCK], config[varables.DIM_EMBEDDING]))
        self.drop = nn.Dropout(config[varables.RATE_DROPOUT])
        self.encoder_blocks = nn.ModuleList(
            [EncoderBlock(config) for _ in range(config[varables.NUM_LAYERS])])
        self.decoder_blocks = nn.ModuleList(
            [DecoderBlock(config) for _ in range(config[varables.NUM_LAYERS])])
        # NOTE(review): ln_f is shared between encoder and decoder outputs.
        self.ln_f = nn.LayerNorm(config[varables.DIM_EMBEDDING])
        self.head = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.SIZE_VOCAB], bias=False)
        self.block_size = config[varables.SIZE_BLOCK]
        self.apply(self._init_weights)
        logger.info("number of parameters: %e", sum(p.numel() for p in self.parameters()))
        self.optimizer = None

    def get_block_size(self):
        """Maximum sequence length supported by pos_emb / masks."""
        return self.block_size

    def _init_weights(self, module):
        """Normal(0, 0.02) init for Linear/Embedding; unit LayerNorm."""
        if isinstance(module, (nn.Linear, nn.Embedding)):
            module.weight.data.normal_(mean=0.0, std=0.02)
            if isinstance(module, nn.Linear) and module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

    def init_optimizers(self, train_config):
        """Adam optimizer from the training config (returned, not stored)."""
        return torch.optim.Adam(self.parameters(), lr=train_config[varables.RATE_LEARNING])

    def init_scheduler(self, train_config):
        """StepLR over ``self.optimizer`` (caller must have assigned it)."""
        return torch.optim.lr_scheduler.StepLR(
            self.optimizer,
            step_size=train_config[varables.SIZE_STEP],
            gamma=train_config[varables.GAMMA])

    def get_collate_fn(self, vocab):
        """collate_fn right-padding both sequences with the single vocab's PAD."""
        def collate(results):
            x_in = [a[0] for a in results]
            y_in = [a[1] for a in results]
            boundary = -1  # placeholder kept for interface parity
            max_len_x = max(len(a) for a in x_in)
            max_len_y = max(len(a) for a in y_in)
            x = torch.tensor(
                [a + [vocab[varables.TOKEN_PAD]] * (max_len_x - len(a)) for a in x_in],
                dtype=torch.long)
            y = torch.tensor(
                [a + [vocab[varables.TOKEN_PAD]] * (max_len_y - len(a)) for a in y_in],
                dtype=torch.long)
            return x, y, boundary
        return collate

    def forward(self, x_in, y_in, y_out=None, boundary=None):
        """Return (logits, loss); loss is None when ``y_out`` is None.

        NOTE(review): no padding mask is applied anywhere and the loss has
        no ignore_index — padded positions attend and contribute to the
        loss.  The newer model.py fixes both.
        """
        x_in = self.drop(self.tok_emb(x_in) + self.pos_emb[:, :x_in.size()[1], :])
        y_in = self.drop(self.tok_emb(y_in) + self.pos_emb[:, :y_in.size()[1], :])
        for encoder_block in self.encoder_blocks:
            x_in = encoder_block(x_in)
        x_in = self.ln_f(x_in)
        for decoder_block in self.decoder_blocks:
            y_in = decoder_block(x_in, y_in)
        y_in = self.ln_f(y_in)
        logits = self.head(y_in)
        loss = None
        if y_out is not None:
            loss = F.cross_entropy(logits.view(-1, logits.size(-1)), y_out.view(-1))
        return logits, loss
"""Current Transformer_debug6/model.py building blocks: sinusoidal positional
encoder, unified (self/cross) attention, FFN, and post-norm encoder/decoder
blocks.  The trailing ``Model`` class continues beyond this chunk and is not
reproduced here.  Behavior preserved; review notes marked NOTE(review).
"""
import logging
import math

import torch
import torch.nn as nn
from torch.autograd import Variable
from torch.nn import functional as F

from SCMG.config import varables


class PositionalEncoder(nn.Module):
    """Vectorised sinusoidal positional encoding, precomputed to SIZE_BLOCK.

    ``forward(T)`` returns the (dropout-applied) first T position vectors,
    shape (1, T, DIM_EMBEDDING); callers add it to their embeddings.
    """

    def __init__(self, config):
        super(PositionalEncoder, self).__init__()
        self.Dropout = nn.Dropout(p=config[varables.RATE_DROPOUT])
        max_len = config[varables.SIZE_BLOCK]
        dim = config[varables.DIM_EMBEDDING]
        table = torch.zeros(max_len, dim)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, dim, 2).float() * (-math.log(10000.0) / dim))
        table[:, 0::2] = torch.sin(position * div_term)
        table[:, 1::2] = torch.cos(position * div_term)
        self.register_buffer('pe', table.unsqueeze(0))

    def forward(self, T):
        # NOTE(review): dropout is applied to the positional table itself
        # (before addition to embeddings) — confirm that is intended.
        return self.Dropout(self.pe[:, :T, :])


class Attention(nn.Module):
    """Multi-head attention serving both self- and cross-attention.

    ``forward(X_1, X_2, mask)``: keys/values come from X_1, queries from
    X_2; passing ``X_2=None`` makes it self-attention over X_1.  ``mask``
    must broadcast to (B, heads, T_query, T_key); zero entries are filled
    with -1e9 before softmax.
    """

    def __init__(self, config):
        super().__init__()
        assert config[varables.DIM_ATTENTION] % config[varables.NUM_HEADS] == 0
        dim_in = config[varables.DIM_EMBEDDING]
        dim_att = config[varables.DIM_ATTENTION]
        self.Key = nn.Linear(dim_in, dim_att)
        self.Query = nn.Linear(dim_in, dim_att)
        self.Value = nn.Linear(dim_in, dim_att)
        self.Dropout_Attention = nn.Dropout(config[varables.RATE_DROPOUT])
        self.Dropout_Residue = nn.Dropout(config[varables.RATE_DROPOUT])
        self.Projection = nn.Linear(dim_att, dim_in)
        self.NumberOfHeads = config[varables.NUM_HEADS]
        self.DimHead = dim_att // self.NumberOfHeads
        self.DimAttention = dim_att

    def forward(self, X_1, X_2, mask=None):
        if X_2 is None:
            X_2 = X_1  # self-attention
        BatchSize, T_kv, _ = X_1.size()
        _, T_q, _ = X_2.size()
        K = self.Key(X_1).view(BatchSize, T_kv, self.NumberOfHeads, self.DimHead).transpose(1, 2)
        Q = self.Query(X_2).view(BatchSize, T_q, self.NumberOfHeads, self.DimHead).transpose(1, 2)
        V = self.Value(X_1).view(BatchSize, T_kv, self.NumberOfHeads, self.DimHead).transpose(1, 2)
        scores = (Q @ K.transpose(-2, -1)) / math.sqrt(self.DimHead)
        scores = scores.masked_fill(mask == 0, -1e9)
        scores = self.Dropout_Attention(F.softmax(scores, dim=-1))
        Z = (scores @ V).transpose(1, 2).contiguous().view(BatchSize, T_q, self.DimAttention)
        return self.Dropout_Residue(self.Projection(Z))


class FeedForward(nn.Module):
    """Position-wise FFN; DIM_FEEDFORWARD==0 defaults to 4*DIM_ATTENTION.

    NOTE(review): dropout sits between GELU and the second linear, and
    there is no dropout on the output — kept as in the original.
    """

    def __init__(self, config):
        super().__init__()
        if config[varables.DIM_FEEDFORWARD] == 0:
            hidden = config[varables.DIM_ATTENTION] * 4
        else:
            hidden = config[varables.DIM_FEEDFORWARD]
        self.Linear1 = nn.Linear(config[varables.DIM_EMBEDDING], hidden)
        self.GELU = nn.GELU()
        self.Linear2 = nn.Linear(hidden, config[varables.DIM_EMBEDDING])
        self.Dropout = nn.Dropout(config[varables.RATE_DROPOUT])

    def forward(self, x):
        return self.Linear2(self.Dropout(self.GELU(self.Linear1(x))))


class EncoderBlock(nn.Module):
    """Post-norm encoder block: LayerNorm(x + sublayer(dropout(x))).

    NOTE(review): dropout is applied to the sublayer INPUT while the
    residual uses the undropped x — unusual ordering, kept as-is.
    """

    def __init__(self, config):
        super().__init__()
        dim = config[varables.DIM_EMBEDDING]
        rate = config[varables.RATE_DROPOUT]
        self.LayerNorm1 = nn.LayerNorm(dim)
        self.LayerNorm2 = nn.LayerNorm(dim)
        self.Dropout1 = nn.Dropout(rate)
        self.Dropout2 = nn.Dropout(rate)
        self.Attention = Attention(config)
        self.FeedForward = FeedForward(config)

    def forward(self, X_Encoder, Mask_Encoder):
        X_Encoder = self.LayerNorm1(
            X_Encoder + self.Attention(self.Dropout1(X_Encoder), None, Mask_Encoder))
        X_Encoder = self.LayerNorm2(
            X_Encoder + self.FeedForward(self.Dropout2(X_Encoder)))
        return X_Encoder


class DecoderBlock(nn.Module):
    """Post-norm decoder block: masked self-attention, cross-attention, FFN."""

    def __init__(self, config):
        super().__init__()
        dim = config[varables.DIM_EMBEDDING]
        rate = config[varables.RATE_DROPOUT]
        self.LayerNorm1 = nn.LayerNorm(dim)
        self.LayerNorm2 = nn.LayerNorm(dim)
        self.LayerNorm3 = nn.LayerNorm(dim)
        self.Dropout1 = nn.Dropout(rate)
        self.Dropout2 = nn.Dropout(rate)
        self.Dropout3 = nn.Dropout(rate)
        self.AttentionMasked = Attention(config)
        self.AttentionCross = Attention(config)
        self.FeedForward = FeedForward(config)

    def forward(self, X_Encoder, X_Decoder, Mask_Cross, Mask_Decoder):
        X_Decoder = self.LayerNorm1(
            X_Decoder + self.AttentionMasked(self.Dropout1(X_Decoder), None, Mask_Decoder))
        X_Decoder = self.LayerNorm2(
            X_Decoder + self.AttentionCross(X_Encoder, self.Dropout2(X_Decoder), Mask_Cross))
        X_Decoder = self.LayerNorm3(
            X_Decoder + self.FeedForward(self.Dropout3(X_Decoder)))
        return X_Decoder
elif isinstance(module, nn.LayerNorm): + # module.bias.data.zero_() + # module.weight.data.fill_(1.0) + def init_optimizers(self,train_config): + optimizer = torch.optim.Adam(self.parameters(), lr=train_config[varables.RATE_LEARNING]) + return optimizer + def init_scheduler(self,train_config): + scheduler = torch.optim.lr_scheduler.StepLR(self.optimizer, step_size=train_config[varables.SIZE_STEP], gamma=train_config[varables.GAMMA]) + return scheduler + def get_collate_fn(self, vocab_encoder,vocab_decoder): + def collate(results): + X_Encoder = [a[0] for a in results] + X_Decoder = [a[1] for a in results] + boundary = -1 + max_len_x = max([len(a) for a in X_Encoder]) + max_len_y = max([len(a) for a in X_Decoder]) + x = torch.tensor([(a+[vocab_encoder[varables.TOKEN_PAD]]*(max_len_x-len(a))) for a in X_Encoder],dtype=torch.long) + y = torch.tensor([(a+[vocab_decoder[varables.TOKEN_PAD]]*(max_len_y-len(a))) for a in X_Decoder],dtype=torch.long) + return x,y,boundary + return collate + + + def generate_masks(self,X_Encoder, X_Decoder): + # Generate encoder, decoder, cross masks + T = X_Decoder.shape[1] + Mask_Encoder = (X_Encoder != self.Token_Padding_Encoder).unsqueeze(-2).unsqueeze(-2) + Mask_Decoder = (X_Decoder != self.Token_Padding_Decoder).unsqueeze(-2).unsqueeze(-2).repeat(1,1,T,1) + Mask_Cross = (X_Encoder != self.Token_Padding_Encoder).unsqueeze(-2).unsqueeze(-2) + mask_tril = torch.tril(torch.ones(T, T)).view(1, 1, T, T).to(Mask_Decoder.device) + Mask_Decoder = Mask_Decoder.masked_fill(mask_tril==0,0) + return Mask_Encoder,Mask_Decoder,Mask_Cross + + def forward(self, X_Encoder, X_Decoder, Y_Decoder_Ref=None,boundary=None): + Mask_Encoder, Mask_Decoder,Mask_Cross = self.generate_masks(X_Encoder, X_Decoder) + # preprocess + X_Encoder = self.Dropout1(self.Embedding_Encoder(X_Encoder) * math.sqrt(self.Dim_Embedding) + self.pos_emb(X_Encoder.size(1))) + X_Decoder = self.Dropout2(self.Embedding_Decoder(X_Decoder) * math.sqrt(self.Dim_Embedding) + 
self.pos_emb(X_Decoder.size(1))) + #### Now X_Encoder: BatchSize, SequenceLength, DimAttention + # Encoder blocks + for encoder_block in self.encoder_blocks: + X_Encoder = encoder_block(X_Encoder,Mask_Encoder) + # X_Encoder = self.LayerNorm1(X_Encoder) + # Decoder blocks + for decoder_block in self.decoder_blocks: + X_Decoder = decoder_block(X_Encoder,X_Decoder,Mask_Cross,Mask_Decoder) + # X_Decoder = self.LayerNorm2(X_Decoder) + Y_Decoder_Logits = self.head(X_Decoder) + loss = None + if Y_Decoder_Ref is not None: + # loss = F.cross_entropy(Y_Decoder_Logits.view(-1, Y_Decoder_Logits.size(-1)), Y_Decoder_Ref.view(-1),ignore_index=self.Token_Padding_Decoder) + loss1 = F.nll_loss(F.log_softmax(Y_Decoder_Logits,dim=-1).view(-1, Y_Decoder_Logits.size(-1)),Y_Decoder_Ref.view(-1),ignore_index=self.Token_Padding_Decoder) + loss2 = F.kl_div(F.log_softmax(Y_Decoder_Logits,dim=-1),F.one_hot(Y_Decoder_Ref,num_classes=Y_Decoder_Logits.shape[-1]).type_as(Y_Decoder_Logits)) + return Y_Decoder_Logits, loss1+loss2 + + + # def generate_masks(self,X_Encoder, X_Decoder): + # # Generate encoder, decoder, cross masks + # Mask_Encoder = (X_Encoder != self.Token_Padding_Encoder).unsqueeze(-2).int().cpu() + # Mask_Decoder = (X_Decoder != self.Token_Padding_Decoder).unsqueeze(-2).int().cpu() + # Mask_Cross = Mask_Decoder.unsqueeze(-1) @ Mask_Encoder.unsqueeze(-2) + # Mask_Encoder = Mask_Encoder.unsqueeze(-1) @ Mask_Encoder.unsqueeze(-2) + # Mask_Decoder = Mask_Decoder.unsqueeze(-1) @ Mask_Decoder.unsqueeze(-2) + # T = X_Decoder.shape[1] + # mask_tril = torch.tril(torch.ones(T, T)).view(1, 1, T, T) + # Mask_Decoder = Mask_Decoder.masked_fill(mask_tril==0,0) + # Mask_Encoder = Mask_Encoder.to(X_Encoder.device) + # Mask_Decoder = Mask_Decoder.to(X_Decoder.device) + # Mask_Cross = Mask_Cross.to(X_Encoder.device) + # return Mask_Encoder,Mask_Decoder,Mask_Cross diff --git a/SCMG/models/Transformer_debug6/sampler.py b/SCMG/models/Transformer_debug6/sampler.py new file mode 100644 index 
0000000000000000000000000000000000000000..1c606302447534d425b50bbd15c153ea79895b65 --- /dev/null +++ b/SCMG/models/Transformer_debug6/sampler.py @@ -0,0 +1,85 @@ +import random +import numpy as np +import torch +import torch.nn as nn +from torch.nn import functional as F + +def set_seed(seed): + random.seed(seed) + np.random.seed(seed) + torch.manual_seed(seed) + torch.cuda.manual_seed_all(seed) + +def top_k_logits(logits, k): + v, ix = torch.topk(logits, k) + out = logits.clone() + out[out < v[:, [-1]]] = -float('Inf') + return out + +@torch.no_grad() +def sample(model, x, steps, temperature=1.0, sample=False, top_k=None): + block_size = model.get_block_size() + model.eval() + for k in range(steps): + x_cond = x if x.size(1) <= block_size else x[:, -block_size:] + logits, _ = model(x_cond) + logits = logits[:, -1, :] / temperature + if top_k is not None: + logits = top_k_logits(logits, top_k) + probs = F.softmax(logits, dim=-1) + if sample: + ix = torch.multinomial(probs, num_samples=1) + else: + _, ix = torch.topk(probs, k=1, dim=-1) + x = torch.cat((x, ix), dim=1) + + return x + + + + +@torch.no_grad() +def sample(model, x, steps, temperature=1.0,boundary=None): + block_size = model.get_block_size() + model.eval() + for k in range(steps): + x_cond = x if x.size(1) <= block_size else x[:, -block_size:] + logits, _ = model(x_cond,boundary=boundary) + logits = logits[:, -1, :] / temperature + probs = F.softmax(logits, dim=-1) + ix = torch.multinomial(probs, num_samples=1) + x = torch.cat((x, ix), dim=1) + return x + +'L_5*C(=O)NCc1cccc(OC)c1.*c1nsc2ccccc12COc1cccc(CNC(=O)c2cccc(NC(=O)c3nsc4ccccc34)c2)c1' + +# for i in range(1,21): +def sample_L(i,option='string'): + # i=2 + prefix = 'L_'+str(i) + string_input = prefix + '*O=C1NN=Cc2c1cccc2.*O=C(C1CC1)N1CCNCC1' + array_input = [vocab[a] for a in [''] + list(string_input)] + boundary = [len(array_input)] + tensor_input = torch.tensor(array_input,device='cuda').unsqueeze(0).repeat(32,1) + boundary = boundary*32 + 
tensor_output = sample(model,tensor_input,250,boundary=boundary) + strings_output = [] + for j in range(tensor_output.shape[0]): + list_string_output = [inv[a] for a in tensor_output[j,boundary[j]:].cpu().numpy() if a != vocab['']] + # if list_string_output[0] == '': + # list_string_output = list_string_output[1:] + if list_string_output[-1] == '': + list_string_output = list_string_output[:-1] + string_output = ''.join(list_string_output) + strings_output.append(string_output) + print(string_output) + for j in range(tensor_output.shape[0]): + if test_valid(strings_output[j]): + print(1) + else: + print(0) + + # logits,_ = model(tensor_input,boundary=boundary) + + +['', 'L', '_', '5', '*', 'C', '(', '=', 'O', ')', 'N', 'C', 'c', '1', 'c', 'c', 'c', 'c', '(', 'O', 'C', ')', 'c', '1', '.', '*', 'c', '1', 'n', 's', 'c', '2', 'c', 'c', 'c', 'c', 'c', '1', '2', 'C', 'O', 'c', '1', 'c', 'c', 'c', 'c', '(', 'C', 'N', 'C', '(', '=', 'O', ')', 'c', '2', 'c', 'c', 'c', 'c', '(', 'N', 'C', '(', '=', 'O', ')', 'c', '3', 'n', 's', 'c', '4', 'c', 'c', 'c', 'c', 'c', '3', '4', ')', 'c', '2', ')', 'c', '1', ''] diff --git a/SCMG/models/Transformer_debug7/__init__.py b/SCMG/models/Transformer_debug7/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/SCMG/models/Transformer_debug7/__pycache__/__init__.cpython-310.pyc b/SCMG/models/Transformer_debug7/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..29099ec7592db2013e76a7bc4927092d1f6b863b Binary files /dev/null and b/SCMG/models/Transformer_debug7/__pycache__/__init__.cpython-310.pyc differ diff --git a/SCMG/models/Transformer_debug7/__pycache__/model.cpython-310.pyc b/SCMG/models/Transformer_debug7/__pycache__/model.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ecc0a192b0932cb2f168d0801551de57acbf6bfd Binary files /dev/null and 
b/SCMG/models/Transformer_debug7/__pycache__/model.cpython-310.pyc differ diff --git a/SCMG/models/Transformer_debug7/model.py b/SCMG/models/Transformer_debug7/model.py new file mode 100644 index 0000000000000000000000000000000000000000..7728093143a64c981c21da51a2ab03af262e6df8 --- /dev/null +++ b/SCMG/models/Transformer_debug7/model.py @@ -0,0 +1,233 @@ +import math +import logging + +import torch +import torch.nn as nn +from torch.nn import functional as F + +# logger = logging.getLogger(__name__) +from SCMG.config import varables +from torch.autograd import Variable + +class PositionalEncoder(nn.Module): + def __init__(self, config): + super(PositionalEncoder, self).__init__() + self.Dropout = nn.Dropout(p=config[varables.RATE_DROPOUT]) + max_len = config[varables.SIZE_BLOCK] + pe = torch.zeros(max_len, config[varables.DIM_EMBEDDING]) + position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1) + div_term = torch.exp(torch.arange(0, config[varables.DIM_EMBEDDING], 2).float() * (-math.log(10000.0) / config[varables.DIM_EMBEDDING])) + pe[:, 0::2] = torch.sin(position * div_term) + pe[:, 1::2] = torch.cos(position * div_term) + pe = pe.unsqueeze(0) + self.register_buffer('pe', pe) + def forward(self, T): + x = self.Dropout(self.pe[:,:T, :]) + return x + + +class Attention(nn.Module): + def __init__(self, config): + super().__init__() + assert config[varables.DIM_ATTENTION] % config[varables.NUM_HEADS] == 0 + self.Key = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION]) + self.Query = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION]) + self.Value = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION]) + self.Dropout_Attention = nn.Dropout(config[varables.RATE_DROPOUT]) + self.Dropout_Residue = nn.Dropout(config[varables.RATE_DROPOUT]) + self.Projection = nn.Linear(config[varables.DIM_ATTENTION], config[varables.DIM_EMBEDDING]) + self.NumberOfHeads = config[varables.NUM_HEADS] + self.DimHead 
= config[varables.DIM_ATTENTION] // self.NumberOfHeads + self.DimAttention = config[varables.DIM_ATTENTION] + + def forward(self, X_1,X_2, mask=None): + if X_2 is None: + X_2 = X_1 + BatchSize, T_Encoder, _ = X_1.size() + BatchSize, T_Decoder, _ = X_2.size() + K = self.Key( X_1).view(BatchSize, T_Encoder, self.NumberOfHeads,self.DimHead).transpose(1, 2) + Q = self.Query(X_2).view(BatchSize, T_Decoder, self.NumberOfHeads,self.DimHead).transpose(1, 2) + V = self.Value(X_1).view(BatchSize, T_Encoder, self.NumberOfHeads,self.DimHead).transpose(1, 2) + # k,q,v dimension: (BatchSize, SequenceSize, NumberOfHeads, HeadDimension) 3,4,5,16 + ScoreAttention = (Q @ K.transpose(-2, -1)) / math.sqrt(self.DimHead) + ScoreAttention = ScoreAttention.masked_fill(mask==0, -1e9) + ScoreAttention = F.softmax(ScoreAttention, dim=-1) + ScoreAttention = self.Dropout_Attention(ScoreAttention) + # k.transpose(-2,-1): 3,4,16,5 + # (q@(k.transpose(-2,-1))): 3,4,5,5 + Z = ScoreAttention @ V + # y dimension: 3,4,5,16 + Z = Z.transpose(1, 2).contiguous().view(BatchSize, T_Decoder, self.DimAttention) + # y dimension: 3,5,64 + Z = self.Dropout_Residue(self.Projection(Z)) + return Z + + + + + + + + + + +class FeedForward(nn.Module): + def __init__(self, config): + super().__init__() + if config[varables.DIM_FEEDFORWARD] == 0: + Dim_FeedForward = config[varables.DIM_ATTENTION] *4 + else: + Dim_FeedForward = config[varables.DIM_FEEDFORWARD] + self.Linear1 = nn.Linear(config[varables.DIM_EMBEDDING], Dim_FeedForward) + self.GELU = nn.GELU() + self.Linear2 = nn.Linear(Dim_FeedForward, config[varables.DIM_EMBEDDING]) + self.Dropout = nn.Dropout(config[varables.RATE_DROPOUT]) + + def forward(self,x): + x = self.Linear1(x) + x = self.GELU (x) + x = self.Dropout(x) + x = self.Linear2(x) + return x + + + + +class EncoderBlock(nn.Module): + def __init__(self, config): + super().__init__() + self.LayerNorm1 = nn.LayerNorm(config[varables.DIM_EMBEDDING]) + self.LayerNorm2 = 
nn.LayerNorm(config[varables.DIM_EMBEDDING]) + self.Dropout1 = nn.Dropout(config[varables.RATE_DROPOUT]) + self.Dropout2 = nn.Dropout(config[varables.RATE_DROPOUT]) + self.Attention = Attention( config) + self.FeedForward = FeedForward(config) + + def forward(self, X_Encoder,Mask_Encoder): + X_Encoder = self.LayerNorm1(X_Encoder + self.Attention (self.Dropout1(X_Encoder), None, Mask_Encoder)) + X_Encoder = self.LayerNorm2(X_Encoder + self.FeedForward(self.Dropout2(X_Encoder))) + return X_Encoder + +class DecoderBlock(nn.Module): + def __init__(self, config): + super().__init__() + self.LayerNorm1 = nn.LayerNorm(config[varables.DIM_EMBEDDING]) + self.LayerNorm2 = nn.LayerNorm(config[varables.DIM_EMBEDDING]) + self.LayerNorm3 = nn.LayerNorm(config[varables.DIM_EMBEDDING]) + self.Dropout1 = nn.Dropout(config[varables.RATE_DROPOUT]) + self.Dropout2 = nn.Dropout(config[varables.RATE_DROPOUT]) + self.Dropout3 = nn.Dropout(config[varables.RATE_DROPOUT]) + self.AttentionMasked = Attention( config) + self.AttentionCross = Attention( config) + self.FeedForward = FeedForward(config) + + def forward(self, X_Encoder,X_Decoder,Mask_Cross,Mask_Decoder): + X_Decoder = self.LayerNorm1(X_Decoder + self.AttentionMasked(self.Dropout1(X_Decoder), None, Mask_Decoder)) + X_Decoder = self.LayerNorm2(X_Decoder + self.AttentionCross ( X_Encoder, self.Dropout2(X_Decoder), Mask_Cross )) + X_Decoder = self.LayerNorm3(X_Decoder + self.FeedForward (self.Dropout3(X_Decoder) )) + return X_Decoder + + + + + + + + + + + + + + + + + + +class Model(nn.Module): + def __init__(self, config): + super().__init__() + # Varables + self.Dim_Embedding = config[varables.DIM_EMBEDDING] + self.Token_Padding_Encoder = config["Token_Padding_Encoder"] + self.Token_Padding_Decoder = config["Token_Padding_Decoder"] + # Embedding and positional encoding layers + self.Embedding_Encoder = nn.Embedding(len(config["vocab_encoder"]), config[varables.DIM_EMBEDDING]) + self.Embedding_Decoder = 
nn.Embedding(len(config["vocab_decoder"]), config[varables.DIM_EMBEDDING]) + self.pos_emb = PositionalEncoder(config) + # Dropout and normalization layers + self.Dropout1 = nn.Dropout(config[varables.RATE_DROPOUT]) + self.Dropout2 = nn.Dropout(config[varables.RATE_DROPOUT]) + self.LayerNorm1 = nn.LayerNorm(config[varables.DIM_EMBEDDING]) + self.LayerNorm2 = nn.LayerNorm(config[varables.DIM_EMBEDDING]) + # Transformer layers + self.encoder_blocks = nn.ModuleList([EncoderBlock(config) for _ in range(config[varables.NUM_LAYERS])]) + self.decoder_blocks = nn.ModuleList([DecoderBlock(config) for _ in range(config[varables.NUM_LAYERS])]) + # Output layer + self.head = nn.Linear(config[varables.DIM_EMBEDDING], len(config["vocab_decoder"]), bias=False) + # Init + self.apply(self._init_weights) + self.optimizer = None + # logger.info("number of parameters: %e", sum(p.numel() for p in self.parameters())) + + def _init_weights(self, module): + for p in module.parameters(): + if p.dim() > 1: + nn.init.xavier_uniform_(p) + # if isinstance(module, (nn.Linear, nn.Embedding)): + # module.weight.data.normal_(mean=0.0, std=0.02) + # if isinstance(module, nn.Linear) and module.bias is not None: + # module.bias.data.zero_() + # elif isinstance(module, nn.LayerNorm): + # module.bias.data.zero_() + # module.weight.data.fill_(1.0) + def init_optimizers(self,train_config): + optimizer = torch.optim.Adam(self.parameters(), lr=train_config[varables.RATE_LEARNING]) + return optimizer + + def init_scheduler(self,train_config): + scheduler = torch.optim.lr_scheduler.StepLR(self.optimizer, step_size=train_config[varables.SIZE_STEP], gamma=train_config[varables.GAMMA]) + return scheduler + def get_collate_fn(self, vocab_encoder,vocab_decoder): + def collate(results): + X_Encoder = [a[0] for a in results] + X_Decoder = [a[1] for a in results] + boundary = -1 + max_len_x = max([len(a) for a in X_Encoder]) + max_len_y = max([len(a) for a in X_Decoder]) + x = 
torch.tensor([(a+[vocab_encoder[varables.TOKEN_PAD]]*(max_len_x-len(a))) for a in X_Encoder],dtype=torch.long) + y = torch.tensor([(a+[vocab_decoder[varables.TOKEN_PAD]]*(max_len_y-len(a))) for a in X_Decoder],dtype=torch.long) + return x,y,boundary + return collate + + def generate_masks(self,X_Encoder, X_Decoder): + # Generate encoder, decoder, cross masks + T = X_Decoder.shape[1] + Mask_Encoder = (X_Encoder != self.Token_Padding_Encoder).unsqueeze(-2).unsqueeze(-2) + Mask_Decoder = (X_Decoder != self.Token_Padding_Decoder).unsqueeze(-2).unsqueeze(-2).repeat(1,1,T,1) + Mask_Cross = (X_Encoder != self.Token_Padding_Encoder).unsqueeze(-2).unsqueeze(-2) + mask_tril = torch.tril(torch.ones(T, T)).view(1, 1, T, T).to(Mask_Decoder.device) + Mask_Decoder = Mask_Decoder.masked_fill(mask_tril==0,0) + return Mask_Encoder,Mask_Decoder,Mask_Cross + + def forward(self, X_Encoder, X_Decoder, Y_Decoder_Ref=None,boundary=None): + Mask_Encoder, Mask_Decoder,Mask_Cross = self.generate_masks(X_Encoder, X_Decoder) + # preprocess + X_Encoder = self.Dropout1(self.Embedding_Encoder(X_Encoder) * math.sqrt(self.Dim_Embedding) + self.pos_emb(X_Encoder.size(1))) + X_Decoder = self.Dropout2(self.Embedding_Decoder(X_Decoder) * math.sqrt(self.Dim_Embedding) + self.pos_emb(X_Decoder.size(1))) + #### Now X_Encoder: BatchSize, SequenceLength, DimAttention + # Encoder blocks + for encoder_block in self.encoder_blocks: + X_Encoder = encoder_block(X_Encoder,Mask_Encoder) + # X_Encoder = self.LayerNorm1(X_Encoder) + # Decoder blocks + for decoder_block in self.decoder_blocks: + X_Decoder = decoder_block(X_Encoder,X_Decoder,Mask_Cross,Mask_Decoder) + # X_Decoder = self.LayerNorm2(X_Decoder) + Y_Decoder_Logits = self.head(X_Decoder) + loss = None + if Y_Decoder_Ref is not None: + # loss = F.cross_entropy(Y_Decoder_Logits.view(-1, Y_Decoder_Logits.size(-1)), Y_Decoder_Ref.view(-1),ignore_index=self.Token_Padding_Decoder) + loss1 = F.nll_loss(F.log_softmax(Y_Decoder_Logits,dim=-1).view(-1, 
Y_Decoder_Logits.size(-1)),Y_Decoder_Ref.view(-1),ignore_index=self.Token_Padding_Decoder) + loss2 = F.kl_div(F.log_softmax(Y_Decoder_Logits,dim=-1),F.one_hot(Y_Decoder_Ref,num_classes=Y_Decoder_Logits.shape[-1]).type_as(Y_Decoder_Logits)) + return Y_Decoder_Logits, loss1+loss2 \ No newline at end of file diff --git a/SCMG/models/Transformer_debug8/__init__.py b/SCMG/models/Transformer_debug8/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/SCMG/models/Transformer_debug8/__pycache__/__init__.cpython-310.pyc b/SCMG/models/Transformer_debug8/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6d605ab90fb3d410739fee3b7401a585edb7f44b Binary files /dev/null and b/SCMG/models/Transformer_debug8/__pycache__/__init__.cpython-310.pyc differ diff --git a/SCMG/models/Transformer_debug8/__pycache__/model.cpython-310.pyc b/SCMG/models/Transformer_debug8/__pycache__/model.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f35af746c45f932d67cd854ebcb8fcc8704c4c2b Binary files /dev/null and b/SCMG/models/Transformer_debug8/__pycache__/model.cpython-310.pyc differ diff --git a/SCMG/models/Transformer_debug8/model.py b/SCMG/models/Transformer_debug8/model.py new file mode 100644 index 0000000000000000000000000000000000000000..fc61da6e3d71be959658010e6639179b3c8f2425 --- /dev/null +++ b/SCMG/models/Transformer_debug8/model.py @@ -0,0 +1,245 @@ +import math +import logging + +import torch +import torch.nn as nn +from torch.nn import functional as F + +# logger = logging.getLogger(__name__) +from SCMG.config import varables +from torch.autograd import Variable + +class PositionalEncoder(nn.Module): + def __init__(self, config): + super(PositionalEncoder, self).__init__() + self.Dropout = nn.Dropout(p=config[varables.RATE_DROPOUT]) + max_len = config[varables.SIZE_BLOCK] + pe = torch.zeros(max_len, 
config[varables.DIM_EMBEDDING]) + position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1) + div_term = torch.exp(torch.arange(0, config[varables.DIM_EMBEDDING], 2).float() * (-math.log(10000.0) / config[varables.DIM_EMBEDDING])) + pe[:, 0::2] = torch.sin(position * div_term) + pe[:, 1::2] = torch.cos(position * div_term) + pe = pe.unsqueeze(0) + self.register_buffer('pe', pe) + def forward(self, T): + x = self.Dropout(self.pe[:,:T, :]) + return x + + +class Attention(nn.Module): + def __init__(self, config): + super().__init__() + assert config[varables.DIM_ATTENTION] % config[varables.NUM_HEADS] == 0 + self.Key = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION]) + self.Query = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION]) + self.Value = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION]) + self.Dropout_Attention = nn.Dropout(config[varables.RATE_DROPOUT]) + self.Dropout_Residue = nn.Dropout(config[varables.RATE_DROPOUT]) + self.Projection = nn.Linear(config[varables.DIM_ATTENTION], config[varables.DIM_EMBEDDING]) + self.NumberOfHeads = config[varables.NUM_HEADS] + self.DimHead = config[varables.DIM_ATTENTION] // self.NumberOfHeads + self.DimAttention = config[varables.DIM_ATTENTION] + + def forward(self, X_1,X_2, mask=None): + if X_2 is None: + X_2 = X_1 + BatchSize, T_Encoder, _ = X_1.size() + BatchSize, T_Decoder, _ = X_2.size() + K = self.Key( X_1).view(BatchSize, T_Encoder, self.NumberOfHeads,self.DimHead).transpose(1, 2) + Q = self.Query(X_2).view(BatchSize, T_Decoder, self.NumberOfHeads,self.DimHead).transpose(1, 2) + V = self.Value(X_1).view(BatchSize, T_Encoder, self.NumberOfHeads,self.DimHead).transpose(1, 2) + # k,q,v dimension: (BatchSize, SequenceSize, NumberOfHeads, HeadDimension) 3,4,5,16 + ScoreAttention = (Q @ K.transpose(-2, -1)) / math.sqrt(self.DimHead) + ScoreAttention = ScoreAttention.masked_fill(mask==0, -1e9) + ScoreAttention = F.softmax(ScoreAttention, 
dim=-1) + ScoreAttention = self.Dropout_Attention(ScoreAttention) + # k.transpose(-2,-1): 3,4,16,5 + # (q@(k.transpose(-2,-1))): 3,4,5,5 + Z = ScoreAttention @ V + # y dimension: 3,4,5,16 + Z = Z.transpose(1, 2).contiguous().view(BatchSize, T_Decoder, self.DimAttention) + # y dimension: 3,5,64 + Z = self.Dropout_Residue(self.Projection(Z)) + return Z + + + + + + + + + + +class FeedForward(nn.Module): + def __init__(self, config): + super().__init__() + if config[varables.DIM_FEEDFORWARD] == 0: + Dim_FeedForward = config[varables.DIM_ATTENTION] *4 + else: + Dim_FeedForward = config[varables.DIM_FEEDFORWARD] + self.Linear1 = nn.Linear(config[varables.DIM_EMBEDDING], Dim_FeedForward) + self.GELU = nn.GELU() + self.Linear2 = nn.Linear(Dim_FeedForward, config[varables.DIM_EMBEDDING]) + self.Dropout = nn.Dropout(config[varables.RATE_DROPOUT]) + + def forward(self,x): + x = self.Linear1(x) + x = self.GELU (x) + x = self.Dropout(x) + x = self.Linear2(x) + return x + + + + +class EncoderBlock(nn.Module): + def __init__(self, config): + super().__init__() + self.LayerNorm1 = nn.LayerNorm(config[varables.DIM_EMBEDDING]) + self.LayerNorm2 = nn.LayerNorm(config[varables.DIM_EMBEDDING]) + self.Dropout1 = nn.Dropout(config[varables.RATE_DROPOUT]) + self.Dropout2 = nn.Dropout(config[varables.RATE_DROPOUT]) + self.Attention = Attention( config) + self.FeedForward = FeedForward(config) + + def forward(self, X_Encoder,Mask_Encoder): + X_Encoder = self.LayerNorm1(X_Encoder + self.Attention (self.Dropout1(X_Encoder), None, Mask_Encoder)) + X_Encoder = self.LayerNorm2(X_Encoder + self.FeedForward(self.Dropout2(X_Encoder))) + return X_Encoder + +class DecoderBlock(nn.Module): + def __init__(self, config): + super().__init__() + self.LayerNorm1 = nn.LayerNorm(config[varables.DIM_EMBEDDING]) + self.LayerNorm2 = nn.LayerNorm(config[varables.DIM_EMBEDDING]) + self.LayerNorm3 = nn.LayerNorm(config[varables.DIM_EMBEDDING]) + self.Dropout1 = nn.Dropout(config[varables.RATE_DROPOUT]) + 
self.Dropout2 = nn.Dropout(config[varables.RATE_DROPOUT]) + self.Dropout3 = nn.Dropout(config[varables.RATE_DROPOUT]) + self.AttentionMasked = Attention( config) + self.AttentionCross = Attention( config) + self.FeedForward = FeedForward(config) + + def forward(self, X_Encoder,X_Decoder,Mask_Cross,Mask_Decoder): + X_Decoder = self.LayerNorm1(X_Decoder + self.AttentionMasked(self.Dropout1(X_Decoder), None, Mask_Decoder)) + X_Decoder = self.LayerNorm2(X_Decoder + self.AttentionCross ( X_Encoder, self.Dropout2(X_Decoder), Mask_Cross )) + X_Decoder = self.LayerNorm3(X_Decoder + self.FeedForward (self.Dropout3(X_Decoder) )) + return X_Decoder + + + + + + + + + + + + + + + + + + +class Model(nn.Module): + def __init__(self, config): + super().__init__() + # Varables + self.Dim_Embedding = config[varables.DIM_EMBEDDING] + self.Token_Padding_Encoder = config["Token_Padding_Encoder"] + self.Token_Padding_Decoder = config["Token_Padding_Decoder"] + # Embedding and positional encoding layers + self.Embedding_Encoder = nn.Embedding(len(config["vocab_encoder"]), config[varables.DIM_EMBEDDING]) + self.Embedding_Decoder = nn.Embedding(len(config["vocab_decoder"]), config[varables.DIM_EMBEDDING]) + self.pos_emb = PositionalEncoder(config) + # Dropout and normalization layers + self.Dropout1 = nn.Dropout(config[varables.RATE_DROPOUT]) + self.Dropout2 = nn.Dropout(config[varables.RATE_DROPOUT]) + self.LayerNorm1 = nn.LayerNorm(config[varables.DIM_EMBEDDING]) + self.LayerNorm2 = nn.LayerNorm(config[varables.DIM_EMBEDDING]) + # Transformer layers + self.encoder_blocks = nn.ModuleList([EncoderBlock(config) for _ in range(config[varables.NUM_LAYERS])]) + self.decoder_blocks = nn.ModuleList([DecoderBlock(config) for _ in range(config[varables.NUM_LAYERS])]) + # Output layer + self.head = nn.Linear(config[varables.DIM_EMBEDDING], len(config["vocab_decoder"]), bias=False) + # Init + self.apply(self._init_weights) + self.optimizer = None + # logger.info("number of parameters: %e", 
sum(p.numel() for p in self.parameters())) + + def _init_weights(self, module): + for p in module.parameters(): + if p.dim() > 1: + nn.init.xavier_uniform_(p) + # if isinstance(module, (nn.Linear, nn.Embedding)): + # module.weight.data.normal_(mean=0.0, std=0.02) + # if isinstance(module, nn.Linear) and module.bias is not None: + # module.bias.data.zero_() + # elif isinstance(module, nn.LayerNorm): + # module.bias.data.zero_() + # module.weight.data.fill_(1.0) + def init_optimizers(self,train_config): + optimizer = torch.optim.Adam(self.parameters(), lr=train_config[varables.RATE_LEARNING]) + return optimizer + def init_scheduler(self,train_config): + scheduler = torch.optim.lr_scheduler.StepLR(self.optimizer, step_size=train_config[varables.SIZE_STEP], gamma=train_config[varables.GAMMA]) + return scheduler + def get_collate_fn(self, vocab_encoder,vocab_decoder): + def collate(results): + X_Encoder = [a[0] for a in results] + X_Decoder = [a[1] for a in results] + boundary = [a[2] for a in results] + max_len_x = max([len(a) for a in X_Encoder]) + max_len_y = max([len(a) for a in X_Decoder]) + x = torch.tensor([(a+[vocab_encoder[varables.TOKEN_PAD] for _ in range(max_len_x-len(a))]) for a in X_Encoder],dtype=torch.long) + y = torch.tensor([(a+[vocab_decoder[varables.TOKEN_PAD] for _ in range(max_len_y-len(a))]) for a in X_Decoder],dtype=torch.long) + x = torch.tensor([(a+[vocab_encoder[varables.TOKEN_PAD] for _ in range(max_len_x-len(a))]) for a in X_Encoder],dtype=torch.long) + return x,y,boundary + return collate + def customize_fn(self,diex): + bos_token = diex[VBS.COLUMN_TASK_TYPE] + x_in = self.tokenizer(diex[VBS.COLUMN_ENCODER]) + y_in = self.tokenizer(diex[VBS.COLUMN_DECODER]) + if len(x_in)>0: + x_in = [bos_token] + x_in + [VBS.TOKEN_END] + y_in = [bos_token] + y_in + [VBS.TOKEN_END] + x_in = [self.vocab_encoder[a] if a in self.vocab_encoder.keys() else self.vocab_encoder[""] for a in x_in ] + y_in = [self.vocab_decoder[a] if a in self.vocab_decoder.keys() 
else self.vocab_decoder[""] for a in y_in ] + boundary = len(x_in)+1 + return x_in,y_in,boundary + + + def generate_masks(self,X_Encoder, X_Decoder): + # Generate encoder, decoder, cross masks + T = X_Decoder.shape[1] + Mask_Encoder = (X_Encoder != self.Token_Padding_Encoder).unsqueeze(-2).unsqueeze(-2) + Mask_Decoder = (X_Decoder != self.Token_Padding_Decoder).unsqueeze(-2).unsqueeze(-2).repeat(1,1,T,1) + Mask_Cross = (X_Encoder != self.Token_Padding_Encoder).unsqueeze(-2).unsqueeze(-2) + mask_tril = torch.tril(torch.ones(T, T)).view(1, 1, T, T).to(Mask_Decoder.device) + Mask_Decoder = Mask_Decoder.masked_fill(mask_tril==0,0) + return Mask_Encoder,Mask_Decoder,Mask_Cross + + def forward(self, X_Encoder, X_Decoder, Y_Decoder_Ref=None,boundary=None): + Mask_Encoder, Mask_Decoder,Mask_Cross = self.generate_masks(X_Encoder, X_Decoder) + # preprocess + X_Encoder = self.Dropout1(self.Embedding_Encoder(X_Encoder) * math.sqrt(self.Dim_Embedding) + self.pos_emb(X_Encoder.size(1))) + X_Decoder = self.Dropout2(self.Embedding_Decoder(X_Decoder) * math.sqrt(self.Dim_Embedding) + self.pos_emb(X_Decoder.size(1))) + #### Now X_Encoder: BatchSize, SequenceLength, DimAttention + # Encoder blocks + for encoder_block in self.encoder_blocks: + X_Encoder = encoder_block(X_Encoder,Mask_Encoder) + # X_Encoder = self.LayerNorm1(X_Encoder) + # Decoder blocks + for decoder_block in self.decoder_blocks: + X_Decoder = decoder_block(X_Encoder,X_Decoder,Mask_Cross,Mask_Decoder) + # X_Decoder = self.LayerNorm2(X_Decoder) + Y_Decoder_Logits = self.head(X_Decoder) + loss = None + if Y_Decoder_Ref is not None: + # loss = F.cross_entropy(Y_Decoder_Logits.view(-1, Y_Decoder_Logits.size(-1)), Y_Decoder_Ref.view(-1),ignore_index=self.Token_Padding_Decoder) + loss1 = F.nll_loss(F.log_softmax(Y_Decoder_Logits,dim=-1).view(-1, Y_Decoder_Logits.size(-1)),Y_Decoder_Ref.view(-1),ignore_index=self.Token_Padding_Decoder) + loss2 = 
F.kl_div(F.log_softmax(Y_Decoder_Logits,dim=-1),F.one_hot(Y_Decoder_Ref,num_classes=Y_Decoder_Logits.shape[-1]).type_as(Y_Decoder_Logits)) + return Y_Decoder_Logits, loss1+loss2 \ No newline at end of file diff --git a/SCMG/models/Transformer_debug9/__init__.py b/SCMG/models/Transformer_debug9/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/SCMG/models/Transformer_debug9/__pycache__/__init__.cpython-310.pyc b/SCMG/models/Transformer_debug9/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b9faba93282737bc6341ad1a160c21f0ef9a54a0 Binary files /dev/null and b/SCMG/models/Transformer_debug9/__pycache__/__init__.cpython-310.pyc differ diff --git a/SCMG/models/Transformer_debug9/__pycache__/model.cpython-310.pyc b/SCMG/models/Transformer_debug9/__pycache__/model.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..fb3989b462ce7c3e35e4925ddba0e7e59ce8debc Binary files /dev/null and b/SCMG/models/Transformer_debug9/__pycache__/model.cpython-310.pyc differ diff --git a/SCMG/models/Transformer_debug9/model.py b/SCMG/models/Transformer_debug9/model.py new file mode 100644 index 0000000000000000000000000000000000000000..813184b65ea5a370097a352b0db0487fe992264e --- /dev/null +++ b/SCMG/models/Transformer_debug9/model.py @@ -0,0 +1,364 @@ +import math +import logging + +import torch +import torch.nn as nn +from torch.nn import functional as F +import partialsmiles as ps +# logger = logging.getLogger(__name__) +from SCMG.config import varables as VBS +from torch.autograd import Variable +import partialsmiles as ps +from SCMG.utils.utils_rsd import * +from rdkit import Chem +from rdkit import RDLogger +RDLogger.DisableLog('rdApp.*') + +class PositionalEncoder(nn.Module): + def __init__(self, config): + super(PositionalEncoder, self).__init__() + self.Dropout = nn.Dropout(p=config[VBS.RATE_DROPOUT]) + max_len 
= config[VBS.SIZE_BLOCK] + pe = torch.zeros(max_len, config[VBS.DIM_EMBEDDING]) + position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1) + div_term = torch.exp(torch.arange(0, config[VBS.DIM_EMBEDDING], 2).float() * (-math.log(10000.0) / config[VBS.DIM_EMBEDDING])) + pe[:, 0::2] = torch.sin(position * div_term) + pe[:, 1::2] = torch.cos(position * div_term) + pe = pe.unsqueeze(0) + self.register_buffer('pe', pe) + def forward(self, T): + x = self.Dropout(self.pe[:,:T, :]) + return x + + +class Attention(nn.Module): + def __init__(self, config): + super().__init__() + assert config[VBS.DIM_ATTENTION] % config[VBS.NUM_HEADS] == 0 + self.Key = nn.Linear(config[VBS.DIM_EMBEDDING], config[VBS.DIM_ATTENTION]) + self.Query = nn.Linear(config[VBS.DIM_EMBEDDING], config[VBS.DIM_ATTENTION]) + self.Value = nn.Linear(config[VBS.DIM_EMBEDDING], config[VBS.DIM_ATTENTION]) + self.Dropout_Attention = nn.Dropout(config[VBS.RATE_DROPOUT]) + self.Dropout_Residue = nn.Dropout(config[VBS.RATE_DROPOUT]) + self.Projection = nn.Linear(config[VBS.DIM_ATTENTION], config[VBS.DIM_EMBEDDING]) + self.NumberOfHeads = config[VBS.NUM_HEADS] + self.DimHead = config[VBS.DIM_ATTENTION] // self.NumberOfHeads + self.DimAttention = config[VBS.DIM_ATTENTION] + + def forward(self, X_1,X_2, mask=None): + if X_2 is None: + X_2 = X_1 + BatchSize, T_Encoder, _ = X_1.size() + BatchSize, T_Decoder, _ = X_2.size() + K = self.Key( X_1).view(BatchSize, T_Encoder, self.NumberOfHeads,self.DimHead).transpose(1, 2) + Q = self.Query(X_2).view(BatchSize, T_Decoder, self.NumberOfHeads,self.DimHead).transpose(1, 2) + V = self.Value(X_1).view(BatchSize, T_Encoder, self.NumberOfHeads,self.DimHead).transpose(1, 2) + # k,q,v dimension: (BatchSize, SequenceSize, NumberOfHeads, HeadDimension) 3,4,5,16 + ScoreAttention = (Q @ K.transpose(-2, -1)) / math.sqrt(self.DimHead) + ScoreAttention = ScoreAttention.masked_fill(mask==0, -1e9) + ScoreAttention = F.softmax(ScoreAttention, dim=-1) + ScoreAttention = 
self.Dropout_Attention(ScoreAttention) + # k.transpose(-2,-1): 3,4,16,5 + # (q@(k.transpose(-2,-1))): 3,4,5,5 + Z = ScoreAttention @ V + # y dimension: 3,4,5,16 + Z = Z.transpose(1, 2).contiguous().view(BatchSize, T_Decoder, self.DimAttention) + # y dimension: 3,5,64 + Z = self.Dropout_Residue(self.Projection(Z)) + return Z + + + + + + + + + + +class FeedForward(nn.Module): + def __init__(self, config): + super().__init__() + if config[VBS.DIM_FEEDFORWARD] == 0: + Dim_FeedForward = config[VBS.DIM_ATTENTION] *4 + else: + Dim_FeedForward = config[VBS.DIM_FEEDFORWARD] + self.Linear1 = nn.Linear(config[VBS.DIM_EMBEDDING], Dim_FeedForward) + self.GELU = nn.GELU() + self.Linear2 = nn.Linear(Dim_FeedForward, config[VBS.DIM_EMBEDDING]) + self.Dropout = nn.Dropout(config[VBS.RATE_DROPOUT]) + + def forward(self,x): + x = self.Linear1(x) + x = self.GELU (x) + x = self.Dropout(x) + x = self.Linear2(x) + return x + + + + +class EncoderBlock(nn.Module): + def __init__(self, config): + super().__init__() + self.LayerNorm1 = nn.LayerNorm(config[VBS.DIM_EMBEDDING]) + self.LayerNorm2 = nn.LayerNorm(config[VBS.DIM_EMBEDDING]) + self.Dropout1 = nn.Dropout(config[VBS.RATE_DROPOUT]) + self.Dropout2 = nn.Dropout(config[VBS.RATE_DROPOUT]) + self.Attention = Attention( config) + self.FeedForward = FeedForward(config) + + def forward(self, X_Encoder,Mask_Encoder): + X_Encoder = self.LayerNorm1(X_Encoder + self.Attention (self.Dropout1(X_Encoder), None, Mask_Encoder)) + X_Encoder = self.LayerNorm2(X_Encoder + self.FeedForward(self.Dropout2(X_Encoder))) + return X_Encoder + +class DecoderBlock(nn.Module): + def __init__(self, config): + super().__init__() + self.LayerNorm1 = nn.LayerNorm(config[VBS.DIM_EMBEDDING]) + self.LayerNorm2 = nn.LayerNorm(config[VBS.DIM_EMBEDDING]) + self.LayerNorm3 = nn.LayerNorm(config[VBS.DIM_EMBEDDING]) + self.Dropout1 = nn.Dropout(config[VBS.RATE_DROPOUT]) + self.Dropout2 = nn.Dropout(config[VBS.RATE_DROPOUT]) + self.Dropout3 = 
nn.Dropout(config[VBS.RATE_DROPOUT]) + self.AttentionMasked = Attention( config) + self.AttentionCross = Attention( config) + self.FeedForward = FeedForward(config) + + def forward(self, X_Encoder,X_Decoder,Mask_Cross,Mask_Decoder): + X_Decoder = self.LayerNorm1(X_Decoder + self.AttentionMasked(self.Dropout1(X_Decoder), None, Mask_Decoder)) + X_Decoder = self.LayerNorm2(X_Decoder + self.AttentionCross ( X_Encoder, self.Dropout2(X_Decoder), Mask_Cross )) + X_Decoder = self.LayerNorm3(X_Decoder + self.FeedForward (self.Dropout3(X_Decoder) )) + return X_Decoder + + + + + +class Model(nn.Module): + def __init__(self, config): + super().__init__() + # VBS + self.Dim_Embedding = config[VBS.DIM_EMBEDDING] + self.Token_Padding_Encoder = config["Token_Padding_Encoder"] + self.Token_Padding_Decoder = config["Token_Padding_Decoder"] + # Embedding and positional encoding layers + self.Embedding_Encoder = nn.Embedding(len(config["vocab_encoder"]), config[VBS.DIM_EMBEDDING]) + self.Embedding_Decoder = nn.Embedding(len(config["vocab_decoder"]), config[VBS.DIM_EMBEDDING]) + self.pos_emb = PositionalEncoder(config) + # Dropout and normalization layers + self.Dropout1 = nn.Dropout(config[VBS.RATE_DROPOUT]) + self.Dropout2 = nn.Dropout(config[VBS.RATE_DROPOUT]) + self.LayerNorm1 = nn.LayerNorm(config[VBS.DIM_EMBEDDING]) + self.LayerNorm2 = nn.LayerNorm(config[VBS.DIM_EMBEDDING]) + # Transformer layers + self.encoder_blocks = nn.ModuleList([EncoderBlock(config) for _ in range(config[VBS.NUM_LAYERS])]) + self.decoder_blocks = nn.ModuleList([DecoderBlock(config) for _ in range(config[VBS.NUM_LAYERS])]) + # Output layer + self.head = nn.Linear(config[VBS.DIM_EMBEDDING], len(config["vocab_decoder"]), bias=False) + # Init + self.apply(self._init_weights) + self.optimizer = None + self.Alpha_LabelSmoothing = None + self.TokenWeight = None + # logger.info("number of parameters: %e", sum(p.numel() for p in self.parameters())) + + def _set_train_params(self,Config): + self.Alpha_LabelSmoothing 
= Config["Alpha_LabelSmoothing"] + self.TokenWeight = Config["TokenWeight"] + + def _init_weights(self, module): + for p in module.parameters(): + if p.dim() > 1: + nn.init.xavier_uniform_(p) + # if isinstance(module, (nn.Linear, nn.Embedding)): + # module.weight.data.normal_(mean=0.0, std=0.02) + # if isinstance(module, nn.Linear) and module.bias is not None: + # module.bias.data.zero_() + # elif isinstance(module, nn.LayerNorm): + # module.bias.data.zero_() + # module.weight.data.fill_(1.0) + def init_optimizers(self,train_config): + optimizer = torch.optim.Adam(self.parameters(), lr=train_config[VBS.RATE_LEARNING]) + return optimizer + def init_scheduler(self,train_config): + scheduler = torch.optim.lr_scheduler.StepLR(self.optimizer, step_size=train_config[VBS.SIZE_STEP], gamma=train_config[VBS.GAMMA]) + return scheduler + def get_collate_fn(self, vocab_encoder,vocab_decoder): + def collate(results): + X_Encoder = [a[0] for a in results] + X_Decoder = [a[1] for a in results] + Auxiliary = [a[2] for a in results] + # + max_len_x = max([len(a) for a in X_Encoder]) + max_len_y = max([len(a) for a in X_Decoder]) + # + x = torch.tensor([(a+[vocab_encoder[VBS.TOKEN_PAD] for _ in range(max_len_x-len(a))]) for a in X_Encoder],dtype=torch.long) + y = torch.tensor([(a+[vocab_decoder[VBS.TOKEN_PAD] for _ in range(max_len_y-len(a))]) for a in X_Decoder],dtype=torch.long) + if isinstance(Auxiliary[0],list): + MaxLen_Auxiliary = max([len(TruthTable) for TruthTable in Auxiliary]) + Len_Vocab = len(self.List_Vocab_Decoder) + Auxiliary = torch.tensor([TruthTable+[[0 for _ in range(Len_Vocab)] for _ in range(MaxLen_Auxiliary-len(TruthTable))] for TruthTable in Auxiliary]) + ## + # + return x,y,Auxiliary + return collate + def customize_model_fn(self,diex): + def fn(diex): + bos_token = diex[VBS.COLUMN_TASK_TYPE] + # Encoder + x_in = self.tokenizer(diex[VBS.COLUMN_ENCODER]) + if len(x_in)>0: + x_in = [bos_token] + x_in + [VBS.TOKEN_END] + x_in = [self.vocab_encoder[a] if a in 
self.vocab_encoder.keys() else self.vocab_encoder[""] for a in x_in ] + # Decoder + y_in = self.tokenizer(diex[VBS.COLUMN_DECODER]) + y_in = [bos_token] + y_in + [VBS.TOKEN_END] + # Auxiliary + ## 1. partial + ## Is Valid + TruthTable = [] + for CurrentIndex in range(1,len(y_in)): + if (y_in[CurrentIndex] == "|" or "<" in y_in[CurrentIndex]) and y_in[CurrentIndex] != VBS.TOKEN_END: + TruthTable.append([0 for _ in range(len(self.List_Vocab_Decoder))]) + continue + CurrentTruthTable = [] + for CurrentToken in self.List_Vocab_Decoder: + try: + _ = ps.ParseSmiles("".join(y_in[1:CurrentIndex])+CurrentToken, partial=True) + IsValid = 1 + except: + IsValid = 0 + if CurrentToken == VBS.TOKEN_END: + CurrentSMI = join_scaf_deco(diex[VBS.COLUMN_ENCODER],"".join(y_in[1:CurrentIndex])) + if len(CurrentSMI) > 0: + IsValid = 1 + CurrentTruthTable.append(IsValid) + TruthTable.append(CurrentTruthTable) + # StrPrint = "".join([f"{a:3}" for a in TruthTable]) + # print(f'''{y_in[i][:5]:5} {StrPrint}''') + y_in = [self.vocab_decoder[a] if a in self.vocab_decoder.keys() else self.vocab_decoder[""] for a in y_in ] + Auxiliary = TruthTable + return x_in,y_in,Auxiliary + return fn + def generate_masks(self,X_Encoder, X_Decoder): + with torch.no_grad(): + # Generate encoder, decoder, cross masks + T = X_Decoder.shape[1] + Mask_Encoder = (X_Encoder != self.Token_Padding_Encoder).unsqueeze(-2).unsqueeze(-2) + Mask_Decoder = (X_Decoder != self.Token_Padding_Decoder).unsqueeze(-2).unsqueeze(-2).repeat(1,1,T,1) + Mask_Cross = (X_Encoder != self.Token_Padding_Encoder).unsqueeze(-2).unsqueeze(-2) + mask_tril = torch.tril(torch.ones(T, T)).view(1, 1, T, T).to(Mask_Decoder.device) + Mask_Decoder = Mask_Decoder.masked_fill(mask_tril==0,0) + return Mask_Encoder,Mask_Decoder,Mask_Cross + + def forward(self, X_Encoder, X_Decoder, Y_Decoder_Ref=None,Auxiliary=None): + Mask_Encoder, Mask_Decoder,Mask_Cross = self.generate_masks(X_Encoder, X_Decoder) + # preprocess + X_Encoder = 
self.Dropout1(self.Embedding_Encoder(X_Encoder) * math.sqrt(self.Dim_Embedding) + self.pos_emb(X_Encoder.size(1))) + X_Decoder = self.Dropout2(self.Embedding_Decoder(X_Decoder) * math.sqrt(self.Dim_Embedding) + self.pos_emb(X_Decoder.size(1))) + #### Now X_Encoder: BatchSize, SequenceLength, DimAttention + # Encoder blocks + for encoder_block in self.encoder_blocks: + X_Encoder = encoder_block(X_Encoder,Mask_Encoder) + # X_Encoder = self.LayerNorm1(X_Encoder) + # Decoder blocks + for decoder_block in self.decoder_blocks: + X_Decoder = decoder_block(X_Encoder,X_Decoder,Mask_Cross,Mask_Decoder) + # X_Decoder = self.LayerNorm2(X_Decoder) + Y_Decoder_Logits = self.head(X_Decoder) + loss = None + if Y_Decoder_Ref is not None: + with torch.no_grad(): + Y_OneHot = F.one_hot(Y_Decoder_Ref, num_classes=len(self.vocab_decoder)) * (1-self.Alpha_LabelSmoothing) + # LabelSmooth + LabelSmooth = torch.ones(len(self.List_Vocab_Decoder),device = Y_Decoder_Ref.device) * self.Alpha_LabelSmoothing / (len(self.List_Vocab_Decoder)-1) + Y_OneHot = Y_OneHot + LabelSmooth + # PartialSMILES + TruthTables = Auxiliary + Y_OneHot = Y_OneHot * TruthTables + # TokenWeight + if self.TokenWeight is not None: + Weight = torch.tensor( + self.TokenWeight, + device = Y_Decoder_Ref.device).unsqueeze(0).unsqueeze(0) + Y_OneHot = Y_OneHot * Weight + # IgnoreIndex + Y_OneHot[Y_Decoder_Ref==self.Token_Padding_Decoder] = 0. 
+ Y_Decoder_Logits_LogSoftmax = F.log_softmax(Y_Decoder_Logits,dim=-1) + loss = -(Y_OneHot * Y_Decoder_Logits_LogSoftmax).sum(dim=-1) + loss = loss.mean() + # loss2 = F.kl_div(F.log_softmax(Y_Decoder_Logits,dim=-1),F.one_hot(Y_Decoder_Ref,num_classes=Y_Decoder_Logits.shape[-1]).type_as(Y_Decoder_Logits)) + return Y_Decoder_Logits, loss + + + +# self = trainer.model_module +# X_Encoder = trainer.X_Encoder +# X_Decoder = trainer.X_Decoder +# Y_Decoder_Ref = trainer.Y_Decoder_Ref +# Auxiliary = trainer.Auxiliary + +# from torch.nn import functional as F +# Y_OneHot = F.one_hot(trainer.Y_Decoder_Ref,num_classes=len(trainer.model.vocab_decoder)) +# import math +# import logging +# import torch +# import torch.nn as nn +# from torch.nn import functional as F +# # logger = logging.getLogger(__name__) +# from SCMG.config import varables as VBS +# from torch.autograd import Variable + +# from SmilesPE.pretokenizer import atomwise_tokenizer +# class debug1(): +# def __init__(self): +# self.tokenizer = atomwise_tokenizer +# self.vocab_encoder = torch.load("vocab_atom.pt") +# self.vocab_decoder = torch.load("vocab_atom.pt") + + +# self = debug1() +# bos_token = "bos_token" +# diex={ +# VBS.COLUMN_ENCODER:"[*]c1cc(NC(=O)c2ccccc2)ccc1F", +# VBS.COLUMN_DECODER:"[*]c1cc(NC(=O)c2cc3c(cn2)OCCO3)ccc1F", +# VBS.COLUMN_TASK_TYPE:"", +# VBS.TOKEN_END:"", +# } +# customize_model_fn(self,diex) + + +# rm -r checkpoints/TFdebug9_512_512_6_20220401_0 +# python -i scripts/create_model_SCMG.py \ +# --model_type=Transformer_debug9 \ +# --model_name=TF_512_512_6_debug9 \ +# --num_decoder_layers=6 \ +# --num_heads=8 \ +# --dim_attention=512 \ +# --dim_feedforward=2048 \ +# --dim_embedding=512 \ +# --rate_dropout=0.2 \ +# --tokenizer=atom \ +# --size_block=300 \ +# --filepath_vocab_encoder=vocab_atom.pt \ +# --filepath_vocab_decoder=vocab_atom.pt \ +# --dirpath_checkpoint=checkpoints/TFdebug9_512_512_6_20220401_0 + +# python \ +# -i \ +# scripts/train/train_SCMG.py \ +# 
--dirpath_data=PreProcess_DecoderOnly/TrainingSets_EncoderDecoder_OneDecoder/ \ +# --size_batch=192 \ +# --size_step=1500 \ +# --rate_learning=0.0001 \ +# --gamma=0.1 \ +# --num_workers=32 \ +# --epochs=49 \ +# --dirpath_checkpoint=checkpoints/TFdebug9_512_512_6_20220401_0/ \ +# --log_level=INFO \ +# --run_one_epoch=0 \ +# --dry_run=0 \ +# --dump=1 \ +# --Alpha_LabelSmoothing=0.1 \ No newline at end of file diff --git a/SCMG/models/Transformer_ref/__init__.py b/SCMG/models/Transformer_ref/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/SCMG/models/Transformer_ref/__pycache__/__init__.cpython-310.pyc b/SCMG/models/Transformer_ref/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..93670b84f2d329c5471f40539da29b6a944d65c0 Binary files /dev/null and b/SCMG/models/Transformer_ref/__pycache__/__init__.cpython-310.pyc differ diff --git a/SCMG/models/Transformer_ref/__pycache__/model copy.cpython-310.pyc b/SCMG/models/Transformer_ref/__pycache__/model copy.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e4ff76c3563838c2acaaa6c601de4bef414b594d Binary files /dev/null and b/SCMG/models/Transformer_ref/__pycache__/model copy.cpython-310.pyc differ diff --git a/SCMG/models/Transformer_ref/__pycache__/model.cpython-310.pyc b/SCMG/models/Transformer_ref/__pycache__/model.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b85ff8f00454f4d18df378f0ec321b9e2eb255f9 Binary files /dev/null and b/SCMG/models/Transformer_ref/__pycache__/model.cpython-310.pyc differ diff --git a/SCMG/models/Transformer_ref/__pycache__/sampler.cpython-310.pyc b/SCMG/models/Transformer_ref/__pycache__/sampler.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1439f107862502f9db1af61aee1b966f145584f4 Binary files /dev/null and 
b/SCMG/models/Transformer_ref/__pycache__/sampler.cpython-310.pyc differ diff --git a/SCMG/models/Transformer_ref/model copy.py b/SCMG/models/Transformer_ref/model copy.py new file mode 100644 index 0000000000000000000000000000000000000000..85ed98da342e63696371099158471e07cd1bf25c --- /dev/null +++ b/SCMG/models/Transformer_ref/model copy.py @@ -0,0 +1,187 @@ +import math +import logging + +import torch +import torch.nn as nn +from torch.nn import functional as F + +logger = logging.getLogger(__name__) +from SCMG.config import varables + +# class ModelConfig(): +# rate_dropout_embedding = 0.1 +# rate_dropout_residue = 0.1 +# rate_dropout_attention = 0.1 +# block_size=125 +# def __init__(self, size_vocab, **kwargs): +# self.size_vocab = size_vocab +# for k,v in kwargs.items(): +# setattr(self, k, v) + +class CausalSelfAttention(nn.Module): + def __init__(self, config): + super().__init__() + assert config[varables.DIM_ATTENTION] % config[varables.NUM_HEADS] == 0 + self.key = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION]) + self.query = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION]) + self.value = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION]) + self.dropout_attention = nn.Dropout(config[varables.RATE_DROPOUT]) + self.dropout_residue = nn.Dropout(config[varables.RATE_DROPOUT]) + self.projection = nn.Linear(config[varables.DIM_ATTENTION], config[varables.DIM_EMBEDDING]) + self.register_buffer("mask", torch.tril(torch.ones(config[varables.SIZE_BLOCK], config[varables.SIZE_BLOCK])) + .view(1, 1, config[varables.SIZE_BLOCK], config[varables.SIZE_BLOCK])) + self.n_head = config[varables.NUM_HEADS] + self.single_head_dim = config[varables.DIM_ATTENTION] // self.n_head + self.attention_features = config[varables.DIM_ATTENTION] + + def forward(self, x, layer_past=None): + B, T, C = x.size() + k = self.key(x).view(B, T, self.n_head,self.single_head_dim).transpose(1, 2) + q = self.query(x).view(B, 
T, self.n_head,self.single_head_dim).transpose(1, 2) + v = self.value(x).view(B, T, self.n_head,self.single_head_dim).transpose(1, 2) + att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1))) + att = att.masked_fill(self.mask[:,:,:T,:T] == 0, float('-inf')) + att = F.softmax(att, dim=-1) + att = self.dropout_attention(att) + y = att @ v + y = y.transpose(1, 2).contiguous().view(B, T, self.attention_features) + y = self.dropout_residue(self.projection(y)) + return y + + +class CrossAttention(nn.Module): + def __init__(self, config): + super().__init__() + assert config[varables.DIM_ATTENTION] % config[varables.NUM_HEADS] == 0 + self.key = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION]) + self.query = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION]) + self.value = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION]) + self.dropout_attention = nn.Dropout(config[varables.RATE_DROPOUT]) + self.dropout_residue = nn.Dropout(config[varables.RATE_DROPOUT]) + self.projection = nn.Linear(config[varables.DIM_ATTENTION], config[varables.DIM_EMBEDDING]) + self.n_head = config[varables.NUM_HEADS] + self.single_head_dim = config[varables.DIM_ATTENTION] // self.n_head + self.attention_features = config[varables.DIM_ATTENTION] + self.register_buffer("mask", torch.tril(torch.ones(config[varables.SIZE_BLOCK], config[varables.SIZE_BLOCK])) + .view(1, 1, config[varables.SIZE_BLOCK], config[varables.SIZE_BLOCK])) + + def forward(self, x_encoder,x_decoder, layer_past=None): + B_encoder, T_encoder, C_encoder = x_encoder.size() + B_decoder, T_decoder, C_decoder = x_decoder.size() + k = self.key( x_encoder).view(B_encoder, T_encoder, self.n_head,self.single_head_dim).transpose(1, 2) + q = self.query(x_decoder).view(B_encoder, T_decoder, self.n_head,self.single_head_dim).transpose(1, 2) + v = self.value(x_encoder).view(B_encoder, T_encoder, self.n_head,self.single_head_dim).transpose(1, 2) + att = (q @ 
k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1))) + att = att.masked_fill(self.mask[:,:,:T_decoder,:T_encoder] == 0, float('-inf')) + att = F.softmax(att, dim=-1) + att = self.dropout_attention(att) + y = att @ v + y = y.transpose(1, 2).contiguous().view(B_encoder, T_decoder, self.attention_features) + y = self.dropout_residue(self.projection(y)) + return y + + + + +class EncoderBlock(nn.Module): + def __init__(self, config): + super().__init__() + self.ln1 = nn.LayerNorm(config[varables.DIM_EMBEDDING]) + self.ln2 = nn.LayerNorm(config[varables.DIM_EMBEDDING]) + self.attn = CausalSelfAttention(config) + self.mlp = nn.Sequential( + nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_FEEDFORWARD]), + nn.GELU(), + nn.Linear(config[varables.DIM_FEEDFORWARD], config[varables.DIM_EMBEDDING]), + nn.Dropout(config[varables.RATE_DROPOUT]), + ) + + def forward(self, x): + # = y_input + x = x + self.attn(self.ln1(x)) + x = x + self.mlp(self.ln2(x)) + return x + +class DecoderBlock(nn.Module): + def __init__(self, config): + super().__init__() + self.ln1 = nn.LayerNorm(config[varables.DIM_EMBEDDING]) + self.ln2 = nn.LayerNorm(config[varables.DIM_EMBEDDING]) + self.masked_attn = CausalSelfAttention(config) + self.cross_attn = CrossAttention(config) + self.mlp = nn.Sequential( + nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_FEEDFORWARD]), + nn.GELU(), + nn.Linear(config[varables.DIM_FEEDFORWARD], config[varables.DIM_EMBEDDING]), + nn.Dropout(config[varables.RATE_DROPOUT]), + ) + + def forward(self, x_encoder,x): + # = y_input + x = x + self.masked_attn(self.ln1(x)) + x = x + self.cross_attn(x_encoder,self.ln1(x)) + x = x + self.mlp(self.ln2(x)) + return x + +class Model(nn.Module): + def __init__(self, config): + super().__init__() + self.tok_emb = nn.Embedding(config[varables.SIZE_VOCAB], config[varables.DIM_EMBEDDING]) + self.pos_emb = nn.Parameter(torch.zeros(1, config[varables.SIZE_BLOCK], config[varables.DIM_EMBEDDING])) + self.drop = 
nn.Dropout(config[varables.RATE_DROPOUT]) + self.encoder_blocks = nn.ModuleList([EncoderBlock(config) for _ in range(config[varables.NUM_LAYERS])]) + self.decoder_blocks = nn.ModuleList([DecoderBlock(config) for _ in range(config[varables.NUM_LAYERS])]) + # self.blocks = nn.Sequential(*[DecoderBlock(config) for _ in range(config[varables.NUM_LAYERS])]) + self.ln_f = nn.LayerNorm(config[varables.DIM_EMBEDDING]) + self.head = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.SIZE_VOCAB], bias=False) + self.block_size = config[varables.SIZE_BLOCK] + self.apply(self._init_weights) + logger.info("number of parameters: %e", sum(p.numel() for p in self.parameters())) + self.optimizer = None + + def get_block_size(self): + return self.block_size + + def _init_weights(self, module): + if isinstance(module, (nn.Linear, nn.Embedding)): + module.weight.data.normal_(mean=0.0, std=0.02) + if isinstance(module, nn.Linear) and module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.LayerNorm): + module.bias.data.zero_() + module.weight.data.fill_(1.0) + def init_optimizers(self,train_config): + optimizer = torch.optim.Adam(self.parameters(), lr=train_config[varables.RATE_LEARNING]) + return optimizer + def init_scheduler(self,train_config): + scheduler = torch.optim.lr_scheduler.StepLR(self.optimizer, step_size=train_config[varables.SIZE_STEP], gamma=train_config[varables.GAMMA]) + return scheduler + def get_collate_fn(self, vocab): + def collate(results): + x_in = [a[0] for a in results] + y_in = [a[1] for a in results] + boundary = -1 + max_len_x = max([len(a) for a in x_in]) + max_len_y = max([len(a) for a in y_in]) + x = torch.tensor([(a+[vocab[varables.TOKEN_PAD]]*(max_len_x-len(a))) for a in x_in],dtype=torch.long) + y = torch.tensor([(a+[vocab[varables.TOKEN_PAD]]*(max_len_y-len(a))) for a in y_in],dtype=torch.long) + return x,y,boundary + return collate + + def forward(self, x_in, y_in, y_out=None,boundary=None): + x_in = 
self.drop(self.tok_emb(x_in) + self.pos_emb[:, :x_in.size()[1], :]) + y_in = self.drop(self.tok_emb(y_in) + self.pos_emb[:, :y_in.size()[1], :]) + # + for encoder_block in self.encoder_blocks: + x_in = encoder_block(x_in) + x_in = self.ln_f(x_in) + for decoder_block in self.decoder_blocks: + y_in = decoder_block(x_in,y_in) + y_in = self.ln_f(y_in) + logits = self.head(y_in) + loss = None + if y_out is not None: + loss = F.cross_entropy(logits.view(-1, logits.size(-1)), y_out.view(-1)) + return logits, loss + +# mark test \ No newline at end of file diff --git a/SCMG/models/Transformer_ref/model.py b/SCMG/models/Transformer_ref/model.py new file mode 100644 index 0000000000000000000000000000000000000000..8b254fbe02eafd3f7370cfae17eb6729563aa260 --- /dev/null +++ b/SCMG/models/Transformer_ref/model.py @@ -0,0 +1,420 @@ +import math +import logging + +import torch +import torch.nn as nn +from torch.nn import functional as F + +logger = logging.getLogger(__name__) +from SCMG.config import varables + +# class ModelConfig(): +# rate_dropout_embedding = 0.1 +# rate_dropout_residue = 0.1 +# rate_dropout_attention = 0.1 +# block_size=125 +# def __init__(self, size_vocab, **kwargs): +# self.size_vocab = size_vocab +# for k,v in kwargs.items(): +# setattr(self, k, v) + +class CausalSelfAttention(nn.Module): + def __init__(self, config): + super().__init__() + assert config[varables.DIM_ATTENTION] % config[varables.NUM_HEADS] == 0 + self.key = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION]) + self.query = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION]) + self.value = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION]) + self.dropout_attention = nn.Dropout(config[varables.RATE_DROPOUT]) + self.dropout_residue = nn.Dropout(config[varables.RATE_DROPOUT]) + self.projection = nn.Linear(config[varables.DIM_ATTENTION], config[varables.DIM_EMBEDDING]) + self.register_buffer("mask", 
torch.tril(torch.ones(config[varables.SIZE_BLOCK], config[varables.SIZE_BLOCK])) + .view(1, 1, config[varables.SIZE_BLOCK], config[varables.SIZE_BLOCK])) + self.n_head = config[varables.NUM_HEADS] + self.single_head_dim = config[varables.DIM_ATTENTION] // self.n_head + self.attention_features = config[varables.DIM_ATTENTION] + + def forward(self, x, layer_past=None): + B, T, C = x.size() + k = self.key(x).view(B, T, self.n_head,self.single_head_dim).transpose(1, 2) + q = self.query(x).view(B, T, self.n_head,self.single_head_dim).transpose(1, 2) + v = self.value(x).view(B, T, self.n_head,self.single_head_dim).transpose(1, 2) + att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1))) + att = att.masked_fill(self.mask[:,:,:T,:T] == 0, float('-inf')) + att = F.softmax(att, dim=-1) + att = self.dropout_attention(att) + y = att @ v + y = y.transpose(1, 2).contiguous().view(B, T, self.attention_features) + y = self.dropout_residue(self.projection(y)) + return y + + +class CrossAttention(nn.Module): + def __init__(self, config): + super().__init__() + assert config[varables.DIM_ATTENTION] % config[varables.NUM_HEADS] == 0 + self.key = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION]) + self.query = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION]) + self.value = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION]) + self.dropout_attention = nn.Dropout(config[varables.RATE_DROPOUT]) + self.dropout_residue = nn.Dropout(config[varables.RATE_DROPOUT]) + self.projection = nn.Linear(config[varables.DIM_ATTENTION], config[varables.DIM_EMBEDDING]) + self.n_head = config[varables.NUM_HEADS] + self.single_head_dim = config[varables.DIM_ATTENTION] // self.n_head + self.attention_features = config[varables.DIM_ATTENTION] + self.register_buffer("mask", torch.tril(torch.ones(config[varables.SIZE_BLOCK], config[varables.SIZE_BLOCK])) + .view(1, 1, config[varables.SIZE_BLOCK], config[varables.SIZE_BLOCK])) + + def 
forward(self, x_encoder,x_decoder, layer_past=None): + B_encoder, T_encoder, C_encoder = x_encoder.size() + B_decoder, T_decoder, C_decoder = x_decoder.size() + k = self.key( x_encoder).view(B_encoder, T_encoder, self.n_head,self.single_head_dim).transpose(1, 2) + q = self.query(x_decoder).view(B_encoder, T_decoder, self.n_head,self.single_head_dim).transpose(1, 2) + v = self.value(x_encoder).view(B_encoder, T_encoder, self.n_head,self.single_head_dim).transpose(1, 2) + att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1))) + att = att.masked_fill(self.mask[:,:,:T_decoder,:T_encoder] == 0, float('-inf')) + att = F.softmax(att, dim=-1) + att = self.dropout_attention(att) + y = att @ v + y = y.transpose(1, 2).contiguous().view(B_encoder, T_decoder, self.attention_features) + y = self.dropout_residue(self.projection(y)) + return y + + + + +class EncoderBlock(nn.Module): + def __init__(self, config): + super().__init__() + self.ln1 = nn.LayerNorm(config[varables.DIM_EMBEDDING]) + self.ln2 = nn.LayerNorm(config[varables.DIM_EMBEDDING]) + self.attn = CausalSelfAttention(config) + self.mlp = nn.Sequential( + nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_FEEDFORWARD]), + nn.GELU(), + nn.Linear(config[varables.DIM_FEEDFORWARD], config[varables.DIM_EMBEDDING]), + nn.Dropout(config[varables.RATE_DROPOUT]), + ) + + def forward(self, x): + # = y_input + x = x + self.attn(self.ln1(x)) + x = x + self.mlp(self.ln2(x)) + return x + +class DecoderBlock(nn.Module): + def __init__(self, config): + super().__init__() + self.ln1 = nn.LayerNorm(config[varables.DIM_EMBEDDING]) + self.ln2 = nn.LayerNorm(config[varables.DIM_EMBEDDING]) + self.masked_attn = CausalSelfAttention(config) + self.cross_attn = CrossAttention(config) + self.mlp = nn.Sequential( + nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_FEEDFORWARD]), + nn.GELU(), + nn.Linear(config[varables.DIM_FEEDFORWARD], config[varables.DIM_EMBEDDING]), + nn.Dropout(config[varables.RATE_DROPOUT]), 
+ ) + + def forward(self, x_encoder,x): + # = y_input + x = x + self.masked_attn(self.ln1(x)) + x = x + self.cross_attn(x_encoder,self.ln1(x)) + x = x + self.mlp(self.ln2(x)) + return x + + + + + + + + + + + + + + + + + +import torch +import torch.nn as nn +import torch.nn.functional as F +import math + + +class Norm(nn.Module): + def __init__(self, d_model, eps = 1e-6): + super().__init__() + + self.size = d_model + + # create two learnable parameters to calibrate normalisation + self.alpha = nn.Parameter(torch.ones(self.size)) + self.bias = nn.Parameter(torch.zeros(self.size)) + + self.eps = eps + + def forward(self, x): + norm = self.alpha * (x - x.mean(dim=-1, keepdim=True)) \ + / (x.std(dim=-1, keepdim=True) + self.eps) + self.bias + return norm + +def attention(q, k, v, d_k, mask=None, dropout=None): + + scores = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(d_k) + + if mask is not None: + mask = mask.unsqueeze(1) + scores = scores.masked_fill(mask == 0, -1e9) + + scores = F.softmax(scores, dim=-1) + + if dropout is not None: + scores = dropout(scores) + + output = torch.matmul(scores, v) + return output + +class MultiHeadAttention(nn.Module): + def __init__(self, heads, d_model, dropout = 0.1): + super().__init__() + + self.d_model = d_model + self.d_k = d_model // heads + self.h = heads + + self.q_linear = nn.Linear(d_model, d_model) + self.v_linear = nn.Linear(d_model, d_model) + self.k_linear = nn.Linear(d_model, d_model) + + self.dropout = nn.Dropout(dropout) + self.out = nn.Linear(d_model, d_model) + + def forward(self, q, k, v, mask=None): + + bs = q.size(0) + + # perform linear operation and split into N heads + k = self.k_linear(k).view(bs, -1, self.h, self.d_k) + q = self.q_linear(q).view(bs, -1, self.h, self.d_k) + v = self.v_linear(v).view(bs, -1, self.h, self.d_k) + + # transpose to get dimensions bs * N * sl * d_model + k = k.transpose(1,2) + q = q.transpose(1,2) + v = v.transpose(1,2) + + + # calculate attention using function we will define 
next + scores = attention(q, k, v, self.d_k, mask, self.dropout) + # concatenate heads and put through final linear layer + concat = scores.transpose(1,2).contiguous()\ + .view(bs, -1, self.d_model) + output = self.out(concat) + + return output + +class FeedForward(nn.Module): + def __init__(self, d_model, d_ff=2048, dropout = 0.1): + super().__init__() + + # We set d_ff as a default to 2048 + self.linear_1 = nn.Linear(d_model, d_ff) + self.dropout = nn.Dropout(dropout) + self.linear_2 = nn.Linear(d_ff, d_model) + + def forward(self, x): + x = self.dropout(F.relu(self.linear_1(x))) + x = self.linear_2(x) + return x + + + + +import torch +import torch.nn as nn +import copy + + +class EncoderLayer(nn.Module): + def __init__(self, d_model, heads, dropout=0.1): + super().__init__() + self.norm_1 = Norm(d_model) + self.norm_2 = Norm(d_model) + self.attn = MultiHeadAttention(heads, d_model, dropout=dropout) + self.ff = FeedForward(d_model, dropout=dropout) + self.dropout_1 = nn.Dropout(dropout) + self.dropout_2 = nn.Dropout(dropout) + + def forward(self, x, mask): + x2 = self.norm_1(x) + x = x + self.dropout_1(self.attn(x2,x2,x2,mask)) + x2 = self.norm_2(x) + x = x + self.dropout_2(self.ff(x2)) + return x + +# build a decoder layer with two multi-head attention layers and +# one feed-forward layer +class DecoderLayer(nn.Module): + def __init__(self, d_model, heads, dropout=0.1): + super().__init__() + self.norm_1 = Norm(d_model) + self.norm_2 = Norm(d_model) + self.norm_3 = Norm(d_model) + + self.dropout_1 = nn.Dropout(dropout) + self.dropout_2 = nn.Dropout(dropout) + self.dropout_3 = nn.Dropout(dropout) + + self.attn_1 = MultiHeadAttention(heads, d_model, dropout=dropout) + self.attn_2 = MultiHeadAttention(heads, d_model, dropout=dropout) + self.ff = FeedForward(d_model, dropout=dropout) + + def forward(self, x, e_outputs, src_mask, trg_mask): + x2 = self.norm_1(x) + x = x + self.dropout_1(self.attn_1(x2, x2, x2, trg_mask)) + x2 = self.norm_2(x) + x = x + 
self.dropout_2(self.attn_2(x2, e_outputs, e_outputs, \ + src_mask)) + x2 = self.norm_3(x) + x = x + self.dropout_3(self.ff(x2)) + return x + + +import torch +import torch.nn as nn +import math +from torch.autograd import Variable + +class Embedder(nn.Module): + def __init__(self, vocab_size, d_model): + super().__init__() + self.d_model = d_model + self.embed = nn.Embedding(vocab_size, d_model) + def forward(self, x): + return self.embed(x) + +class PositionalEncoder(nn.Module): + def __init__(self, d_model, max_seq_len = 200, dropout = 0.1): + super().__init__() + self.d_model = d_model + self.dropout = nn.Dropout(dropout) + # create constant 'pe' matrix with values dependant on + # pos and i + pe = torch.zeros(max_seq_len, d_model) + for pos in range(max_seq_len): + for i in range(0, d_model, 2): + pe[pos, i] = \ + math.sin(pos / (10000 ** ((2 * i)/d_model))) + pe[pos, i + 1] = \ + math.cos(pos / (10000 ** ((2 * (i + 1))/d_model))) + pe = pe.unsqueeze(0) + self.register_buffer('pe', pe) + + + def forward(self, x): + # make embeddings relatively larger + x = x * math.sqrt(self.d_model) + #add constant to embedding + seq_len = x.size(1) + pe = Variable(self.pe[:,:seq_len], requires_grad=False) + if x.is_cuda: + pe.cuda() + x = x + pe + return self.dropout(x) + +def get_clones(module, N): + return nn.ModuleList([copy.deepcopy(module) for i in range(N)]) + +class Encoder(nn.Module): + def __init__(self, vocab_size, d_model, N, heads, dropout): + super().__init__() + self.N = N + self.embed = Embedder(vocab_size, d_model) + self.pe = PositionalEncoder(d_model, dropout=dropout) + self.layers = get_clones(EncoderLayer(d_model, heads, dropout), N) + self.norm = Norm(d_model) + def forward(self, src, mask): + x = self.embed(src) + x = self.pe(x) + for i in range(self.N): + x = self.layers[i](x, mask) + return self.norm(x) + +class Decoder(nn.Module): + def __init__(self, vocab_size, d_model, N, heads, dropout): + super().__init__() + self.N = N + self.embed = 
Embedder(vocab_size, d_model) + self.pe = PositionalEncoder(d_model, dropout=dropout) + self.layers = get_clones(DecoderLayer(d_model, heads, dropout), N) + self.norm = Norm(d_model) + def forward(self, trg, e_outputs, src_mask, trg_mask): + x = self.embed(trg) + x = self.pe(x) + for i in range(self.N): + x = self.layers[i](x, e_outputs, src_mask, trg_mask) + return self.norm(x) + +class Model(nn.Module): + def __init__(self, config): + super().__init__() + self.encoder = Encoder(len(config["vocab_encoder"]), config[varables.DIM_ATTENTION], config[varables.NUM_LAYERS], config[varables.NUM_HEADS], config[varables.RATE_DROPOUT]) + self.decoder = Decoder(len(config["vocab_decoder"]), config[varables.DIM_ATTENTION], config[varables.NUM_LAYERS], config[varables.NUM_HEADS], config[varables.RATE_DROPOUT]) + self.out = nn.Linear(config[varables.DIM_ATTENTION], len(config["vocab_decoder"])) + # self.tok_emb = nn.Embedding(config[varables.SIZE_VOCAB], config[varables.DIM_EMBEDDING]) + # self.pos_emb = nn.Parameter(torch.zeros(1, config[varables.SIZE_BLOCK], config[varables.DIM_EMBEDDING])) + # self.drop = nn.Dropout(config[varables.RATE_DROPOUT]) + # self.encoder_blocks = nn.ModuleList([EncoderBlock(config) for _ in range(config[varables.NUM_LAYERS])]) + # self.decoder_blocks = nn.ModuleList([DecoderBlock(config) for _ in range(config[varables.NUM_LAYERS])]) + # self.blocks = nn.Sequential(*[DecoderBlock(config) for _ in range(config[varables.NUM_LAYERS])]) + # self.ln_f = nn.LayerNorm(config[varables.DIM_EMBEDDING]) + # self.head = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.SIZE_VOCAB], bias=False) + # self.block_size = config[varables.SIZE_BLOCK] + # self.apply(self._init_weights) + # logger.info("number of parameters: %e", sum(p.numel() for p in self.parameters())) + self.optimizer = None + + def get_block_size(self): + return self.block_size + + def _init_weights(self, module): + if isinstance(module, (nn.Linear, nn.Embedding)): + 
module.weight.data.normal_(mean=0.0, std=0.02) + if isinstance(module, nn.Linear) and module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.LayerNorm): + module.bias.data.zero_() + module.weight.data.fill_(1.0) + def init_optimizers(self,train_config): + optimizer = torch.optim.Adam(self.parameters(), lr=train_config[varables.RATE_LEARNING]) + return optimizer + def init_scheduler(self,train_config): + scheduler = torch.optim.lr_scheduler.StepLR(self.optimizer, step_size=train_config[varables.SIZE_STEP], gamma=train_config[varables.GAMMA]) + return scheduler + def get_collate_fn(self, vocab_encoder,vocab_decoder): + def collate(results): + x_in = [a[0] for a in results] + y_in = [a[1] for a in results] + boundary = -1 + max_len_x = max([len(a) for a in x_in]) + max_len_y = max([len(a) for a in y_in]) + x = torch.tensor([(a+[vocab_encoder[varables.TOKEN_PAD]]*(max_len_x-len(a))) for a in x_in],dtype=torch.long) + y = torch.tensor([(a+[vocab_decoder[varables.TOKEN_PAD]]*(max_len_y-len(a))) for a in y_in],dtype=torch.long) + return x,y,boundary + return collate + def forward(self, src, trg, trg_out, boundary=None): + src_mask = None + trg_mask = torch.tril(torch.ones(trg.shape[1], trg.shape[1])).view(1, 1, trg.shape[1], trg.shape[1]).to(trg.device) + e_outputs = self.encoder(src, src_mask) + d_output = self.decoder(trg, e_outputs, src_mask, trg_mask) + logits = self.out(d_output) + loss = None + if trg_out is not None: + loss = F.cross_entropy(logits.view(-1, logits.size(-1)), trg_out.view(-1)) + return logits, loss + +# mark test \ No newline at end of file diff --git a/SCMG/models/Transformer_ref/sampler.py b/SCMG/models/Transformer_ref/sampler.py new file mode 100644 index 0000000000000000000000000000000000000000..1c606302447534d425b50bbd15c153ea79895b65 --- /dev/null +++ b/SCMG/models/Transformer_ref/sampler.py @@ -0,0 +1,85 @@ +import random +import numpy as np +import torch +import torch.nn as nn +from torch.nn import functional as F + 
def set_seed(seed):
    """Seed python, numpy and torch RNGs for reproducible sampling."""
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)


def top_k_logits(logits, k):
    """Keep the ``k`` largest logits per row; set every other entry to -inf."""
    v, _ = torch.topk(logits, k)
    out = logits.clone()
    out[out < v[:, [-1]]] = -float('Inf')
    return out


@torch.no_grad()
def sample(model, x, steps, temperature=1.0, boundary=None, top_k=None, greedy=False):
    """Autoregressively extend ``x`` by ``steps`` tokens.

    BUG FIX: the original module defined ``sample`` twice; the second
    definition silently shadowed the first, losing its top-k and greedy
    decoding options.  Both are merged here.  The defaults reproduce the
    second (previously effective) definition exactly, while ``top_k`` and
    ``greedy`` restore the first definition's behaviour.

    Args:
        model: autoregressive model exposing ``get_block_size()`` and
            returning ``(logits, loss)`` when called.
        x: (B, T) long tensor of prompt token indices.
        steps: number of tokens to append.
        temperature: softmax temperature (> 0).
        boundary: optional per-sample prefix lengths forwarded to the model.
        top_k: if given, restrict sampling to the k most likely tokens.
        greedy: if True, take the argmax instead of sampling.

    Returns:
        (B, T + steps) tensor of token indices.
    """
    block_size = model.get_block_size()
    model.eval()
    for _ in range(steps):
        # crop the conditioning context to the model's maximum block size
        x_cond = x if x.size(1) <= block_size else x[:, -block_size:]
        logits, _ = model(x_cond, boundary=boundary)
        logits = logits[:, -1, :] / temperature
        if top_k is not None:
            logits = top_k_logits(logits, top_k)
        probs = F.softmax(logits, dim=-1)
        if greedy:
            _, ix = torch.topk(probs, k=1, dim=-1)
        else:
            ix = torch.multinomial(probs, num_samples=1)
        x = torch.cat((x, ix), dim=1)
    return x


def sample_L(i, option='string'):
    """Interactive linker-sampling driver kept from the original module.

    NOTE(review): ``vocab``, ``inv``, ``model`` and ``test_valid`` are NOT
    defined in this file; this function only runs in a session that supplies
    them as globals (and a CUDA device) -- confirm before relying on it.
    ``option`` is currently unused.
    """
    prefix = 'L_' + str(i)
    string_input = prefix + '*O=C1NN=Cc2c1cccc2.*O=C(C1CC1)N1CCNCC1'
    array_input = [vocab[a] for a in [''] + list(string_input)]
    boundary = [len(array_input)]
    tensor_input = torch.tensor(array_input, device='cuda').unsqueeze(0).repeat(32, 1)
    boundary = boundary * 32
    tensor_output = sample(model, tensor_input, 250, boundary=boundary)
    strings_output = []
    for j in range(tensor_output.shape[0]):
        # decode generated tokens after the prompt, dropping pad-like tokens
        list_string_output = [inv[a] for a in tensor_output[j, boundary[j]:].cpu().numpy() if a != vocab['']]
        if list_string_output[-1] == '':
            list_string_output = list_string_output[:-1]
        string_output = ''.join(list_string_output)
        strings_output.append(string_output)
        print(string_output)
    for j in range(tensor_output.shape[0]):
        # print 1 for a chemically valid output, 0 otherwise
        if test_valid(strings_output[j]):
            print(1)
        else:
            print(0)
a/SCMG/models/UTFMG/__pycache__/sampler.cpython-310.pyc b/SCMG/models/UTFMG/__pycache__/sampler.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..7d8b4302ef6fdd87e2ad670175c89d8f4c473a69 Binary files /dev/null and b/SCMG/models/UTFMG/__pycache__/sampler.cpython-310.pyc differ diff --git a/SCMG/models/UTFMG/config.py b/SCMG/models/UTFMG/config.py new file mode 100644 index 0000000000000000000000000000000000000000..94d854ccf0a67422a41f53a05eb3e4e103a45f18 --- /dev/null +++ b/SCMG/models/UTFMG/config.py @@ -0,0 +1,39 @@ +import argparse + + +def get_parser(parser=None): + if parser is None: + parser = argparse.ArgumentParser() + + # Model + model_arg = parser.add_argument_group('Model') + model_arg.add_argument("--num_layers", type=int, default=3, + help="Number of LSTM layers") + model_arg.add_argument("--hidden", type=int, default=768, + help="Hidden size") + model_arg.add_argument("--dropout", type=float, default=0.2, + help="dropout between LSTM layers except for last") + + # Train + train_arg = parser.add_argument_group('Training') + train_arg.add_argument('--train_epochs', type=int, default=80, + help='Number of epochs for model training') + train_arg.add_argument('--n_batch', type=int, default=64, + help='Size of batch') + train_arg.add_argument('--lr', type=float, default=1e-3, + help='Learning rate') + train_arg.add_argument('--step_size', type=int, default=10, + help='Period of learning rate decay') + train_arg.add_argument('--gamma', type=float, default=0.5, + help='Multiplicative factor of learning rate decay') + train_arg.add_argument('--n_jobs', type=int, default=1, + help='Number of threads') + train_arg.add_argument('--n_workers', type=int, default=1, + help='Number of workers for DataLoaders') + + return parser + + +def get_config(): + parser = get_parser() + return parser.parse_known_args()[0] diff --git a/SCMG/models/UTFMG/model.py b/SCMG/models/UTFMG/model.py new file mode 100644 index 
import math
import logging

import torch
import torch.nn as nn
from torch.nn import functional as F

from SCMG.config import varables

logger = logging.getLogger(__name__)


class CausalSelfAttention(nn.Module):
    """Multi-head self-attention with a causal mask.

    When per-sample ``boundary`` indices are given, the first ``boundary[i]``
    positions of sample ``i`` (the scaffold / prompt prefix) attend to each
    other bidirectionally while the rest of the sequence stays causal.
    """

    def __init__(self, config):
        super().__init__()
        assert config[varables.DIM_ATTENTION] % config[varables.NUM_HEADS] == 0
        self.key = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION])
        self.query = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION])
        self.value = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION])
        self.dropout_attention = nn.Dropout(config[varables.RATE_DROPOUT])
        self.dropout_residue = nn.Dropout(config[varables.RATE_DROPOUT])
        self.projection = nn.Linear(config[varables.DIM_ATTENTION], config[varables.DIM_EMBEDDING])
        # lower-triangular causal mask, registered so it moves with the module's device
        self.register_buffer("mask", torch.tril(torch.ones(config[varables.SIZE_BLOCK], config[varables.SIZE_BLOCK]))
                             .view(1, 1, config[varables.SIZE_BLOCK], config[varables.SIZE_BLOCK]))
        self.n_head = config[varables.NUM_HEADS]
        self.single_head_dim = config[varables.DIM_ATTENTION] // self.n_head
        self.attention_features = config[varables.DIM_ATTENTION]

    def forward(self, x, layer_past=None, boundary=None):
        """Attend over ``x`` of shape (B, T, C); ``layer_past`` is accepted but unused."""
        B, T, C = x.size()
        k = self.key(x).view(B, T, self.n_head, self.single_head_dim).transpose(1, 2)
        q = self.query(x).view(B, T, self.n_head, self.single_head_dim).transpose(1, 2)
        v = self.value(x).view(B, T, self.n_head, self.single_head_dim).transpose(1, 2)
        att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1)))
        if boundary is None:
            att = att.masked_fill(self.mask[:, :, :T, :T] == 0, float('-inf'))
        else:
            # BUG FIX: the original rebuilt this mask from a name ``config``
            # that is not in scope inside forward() (NameError whenever a
            # boundary was supplied), and used a step slice ``::boundary[i]``
            # where the prefix slice ``:boundary[i]`` was clearly intended.
            # Derive the per-sample mask from the registered buffer instead
            # (same values, correct device).  Assumes boundary[i] <= T --
            # TODO confirm against the collate_fn's boundary convention.
            mask = self.mask[:, :, :T, :T].repeat(B, 1, 1, 1)
            for i, b in enumerate(boundary):
                # prefix tokens attend to each other bidirectionally
                mask[i, 0, :b, :b] = 1
            att = att.masked_fill(mask == 0, float('-inf'))
        att = F.softmax(att, dim=-1)
        att = self.dropout_attention(att)
        y = att @ v
        y = y.transpose(1, 2).contiguous().view(B, T, self.attention_features)
        y = self.dropout_residue(self.projection(y))
        return y


class Block(nn.Module):
    """Pre-norm transformer block: causal self-attention + GELU MLP, both residual."""

    def __init__(self, config):
        super().__init__()
        self.ln1 = nn.LayerNorm(config[varables.DIM_EMBEDDING])
        self.ln2 = nn.LayerNorm(config[varables.DIM_EMBEDDING])
        self.attn = CausalSelfAttention(config)
        self.mlp = nn.Sequential(
            nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_FEEDFORWARD]),
            nn.GELU(),
            nn.Linear(config[varables.DIM_FEEDFORWARD], config[varables.DIM_EMBEDDING]),
            nn.Dropout(config[varables.RATE_DROPOUT]),
        )

    def forward(self, x, boundary):
        # BUG FIX: the original passed ``boundary`` positionally, where it
        # landed in the unused ``layer_past`` parameter of the attention
        # layer, so the boundary mask was silently never applied.
        x = x + self.attn(self.ln1(x), boundary=boundary)
        x = x + self.mlp(self.ln2(x))
        return x


class Model(nn.Module):
    """Decoder-only transformer language model over token indices.

    ``config`` is a dict keyed by the constants in ``SCMG.config.varables``.
    """

    def __init__(self, config):
        super().__init__()
        self.tok_emb = nn.Embedding(config[varables.SIZE_VOCAB], config[varables.DIM_EMBEDDING])
        # learned absolute positional embeddings, one row per block position
        self.pos_emb = nn.Parameter(torch.zeros(1, config[varables.SIZE_BLOCK], config[varables.DIM_EMBEDDING]))
        self.drop = nn.Dropout(config[varables.RATE_DROPOUT])
        self.blocks = nn.ModuleList([Block(config) for _ in range(config[varables.NUM_LAYERS])])
        self.ln_f = nn.LayerNorm(config[varables.DIM_EMBEDDING])
        self.head = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.SIZE_VOCAB], bias=False)
        self.block_size = config[varables.SIZE_BLOCK]
        self.apply(self._init_weights)
        logger.info("number of parameters: %e", sum(p.numel() for p in self.parameters()))
        # set by the training loop before init_scheduler() is called
        self.optimizer = None

    def get_block_size(self):
        """Maximum sequence length the positional embeddings support."""
        return self.block_size

    def _init_weights(self, module):
        """GPT-style init: N(0, 0.02) weights, zero biases, unit LayerNorm gain."""
        if isinstance(module, (nn.Linear, nn.Embedding)):
            module.weight.data.normal_(mean=0.0, std=0.02)
            if isinstance(module, nn.Linear) and module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

    def init_optimizers(self, train_config):
        """Create an Adam optimizer over all parameters.

        The caller is expected to assign the result to ``self.optimizer``
        before calling :meth:`init_scheduler`.
        """
        optimizer = torch.optim.Adam(self.parameters(), lr=train_config[varables.RATE_LEARNING])
        return optimizer

    def init_scheduler(self, train_config):
        """Step-decay LR scheduler bound to ``self.optimizer``.

        NOTE(review): fails if ``self.optimizer`` has not been assigned yet.
        """
        scheduler = torch.optim.lr_scheduler.StepLR(
            self.optimizer,
            step_size=train_config[varables.SIZE_STEP],
            gamma=train_config[varables.GAMMA],
        )
        return scheduler

    def get_collate_fn(self, vocab, vocab2):
        """Return a DataLoader ``collate_fn`` joining (prefix, target) pairs.

        Each sample is ``prefix + [SEP] + target`` right-padded to the batch
        maximum; ``boundary`` carries each sample's third element (prefix
        length).  ``vocab2`` is unused -- kept for signature parity with the
        encoder/decoder models.
        """
        def collate(results):
            x_in = None  # decoder-only model: no separate encoder input
            y_in = [a[0] + [vocab[varables.TOKEN_SEP]] + a[1] for a in results]
            boundary = [a[2] for a in results]
            max_len = max(len(a) for a in y_in)
            pad = vocab[varables.TOKEN_PAD]
            y = torch.tensor([a + [pad] * (max_len - len(a)) for a in y_in], dtype=torch.long)
            return x_in, y, boundary
        return collate

    def forward(self, x_in, y_in, y_out=None, boundary=None):
        """Compute next-token logits (and loss when ``y_out`` is given).

        Args:
            x_in: unused (decoder-only); kept for interface parity.
            y_in: (B, T) long tensor of input token indices, T <= block_size.
            y_out: optional (B, T) target indices for cross-entropy.
            boundary: optional per-sample prefix lengths (see attention).

        Returns:
            ``(logits, loss)`` where ``loss`` is ``None`` without targets.
        """
        b, t = y_in.size()
        assert t <= self.block_size
        token_embeddings = self.tok_emb(y_in)
        position_embeddings = self.pos_emb[:, :t, :]
        x = self.drop(token_embeddings + position_embeddings)
        for block in self.blocks:
            x = block(x, boundary)
        x = self.ln_f(x)
        logits = self.head(x)
        loss = None
        if y_out is not None:
            # NOTE(review): pad positions are not excluded from the loss;
            # consider ignore_index=vocab[TOKEN_PAD] in cross_entropy.
            loss = F.cross_entropy(logits.view(-1, logits.size(-1)), y_out.view(-1))
        return logits, loss
import numpy as np
import torch
import torch.nn as nn
from torch.nn import functional as F
import random  # imported here defensively; set_seed below requires it


def set_seed(seed):
    """Seed python, numpy and torch RNGs for reproducible sampling."""
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)


def top_k_logits(logits, k):
    """Keep the ``k`` largest logits per row; set every other entry to -inf."""
    v, _ = torch.topk(logits, k)
    out = logits.clone()
    out[out < v[:, [-1]]] = -float('Inf')
    return out


@torch.no_grad()
def sample(model, x, steps, temperature=1.0, boundary=None, top_k=None, greedy=False):
    """Autoregressively extend ``x`` by ``steps`` tokens.

    BUG FIX: the original module defined ``sample`` twice; the second
    definition silently shadowed the first, losing its top-k and greedy
    decoding options.  Both are merged here.  The defaults reproduce the
    second (previously effective) definition exactly, while ``top_k`` and
    ``greedy`` restore the first definition's behaviour.

    Args:
        model: autoregressive model exposing ``get_block_size()`` and
            returning ``(logits, loss)`` when called.
        x: (B, T) long tensor of prompt token indices.
        steps: number of tokens to append.
        temperature: softmax temperature (> 0).
        boundary: optional per-sample prefix lengths forwarded to the model.
        top_k: if given, restrict sampling to the k most likely tokens.
        greedy: if True, take the argmax instead of sampling.

    Returns:
        (B, T + steps) tensor of token indices.
    """
    block_size = model.get_block_size()
    model.eval()
    for _ in range(steps):
        # crop the conditioning context to the model's maximum block size
        x_cond = x if x.size(1) <= block_size else x[:, -block_size:]
        logits, _ = model(x_cond, boundary=boundary)
        logits = logits[:, -1, :] / temperature
        if top_k is not None:
            logits = top_k_logits(logits, top_k)
        probs = F.softmax(logits, dim=-1)
        if greedy:
            _, ix = torch.topk(probs, k=1, dim=-1)
        else:
            ix = torch.multinomial(probs, num_samples=1)
        x = torch.cat((x, ix), dim=1)
    return x


def sample_L(i, option='string'):
    """Interactive linker-sampling driver kept from the original module.

    NOTE(review): ``vocab``, ``inv``, ``model`` and ``test_valid`` are NOT
    defined in this file; this function only runs in a session that supplies
    them as globals (and a CUDA device) -- confirm before relying on it.
    ``option`` is currently unused.
    """
    prefix = 'L_' + str(i)
    string_input = prefix + '*O=C1NN=Cc2c1cccc2.*O=C(C1CC1)N1CCNCC1'
    array_input = [vocab[a] for a in [''] + list(string_input)]
    boundary = [len(array_input)]
    tensor_input = torch.tensor(array_input, device='cuda').unsqueeze(0).repeat(32, 1)
    boundary = boundary * 32
    tensor_output = sample(model, tensor_input, 250, boundary=boundary)
    strings_output = []
    for j in range(tensor_output.shape[0]):
        # decode generated tokens after the prompt, dropping pad-like tokens
        list_string_output = [inv[a] for a in tensor_output[j, boundary[j]:].cpu().numpy() if a != vocab['']]
        if list_string_output[-1] == '':
            list_string_output = list_string_output[:-1]
        string_output = ''.join(list_string_output)
        strings_output.append(string_output)
        print(string_output)
    for j in range(tensor_output.shape[0]):
        # print 1 for a chemically valid output, 0 otherwise
        if test_valid(strings_output[j]):
            print(1)
        else:
            print(0)
0000000000000000000000000000000000000000..43d95eca0861879943db41d5c6171fb4cf48f2f7 Binary files /dev/null and b/SCMG/models/UTFMG2/__pycache__/model copy.cpython-310.pyc differ diff --git a/SCMG/models/UTFMG2/__pycache__/model.cpython-310.pyc b/SCMG/models/UTFMG2/__pycache__/model.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..758d86f65d5be18c42aece49ea8541107eb339f2 Binary files /dev/null and b/SCMG/models/UTFMG2/__pycache__/model.cpython-310.pyc differ diff --git a/SCMG/models/UTFMG2/__pycache__/sampler.cpython-310.pyc b/SCMG/models/UTFMG2/__pycache__/sampler.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0b60eb1e593e7e5687f7467a526d4616c061305f Binary files /dev/null and b/SCMG/models/UTFMG2/__pycache__/sampler.cpython-310.pyc differ diff --git a/SCMG/models/UTFMG2/model copy 2.py b/SCMG/models/UTFMG2/model copy 2.py new file mode 100644 index 0000000000000000000000000000000000000000..8b254fbe02eafd3f7370cfae17eb6729563aa260 --- /dev/null +++ b/SCMG/models/UTFMG2/model copy 2.py @@ -0,0 +1,420 @@ +import math +import logging + +import torch +import torch.nn as nn +from torch.nn import functional as F + +logger = logging.getLogger(__name__) +from SCMG.config import varables + +# class ModelConfig(): +# rate_dropout_embedding = 0.1 +# rate_dropout_residue = 0.1 +# rate_dropout_attention = 0.1 +# block_size=125 +# def __init__(self, size_vocab, **kwargs): +# self.size_vocab = size_vocab +# for k,v in kwargs.items(): +# setattr(self, k, v) + +class CausalSelfAttention(nn.Module): + def __init__(self, config): + super().__init__() + assert config[varables.DIM_ATTENTION] % config[varables.NUM_HEADS] == 0 + self.key = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION]) + self.query = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION]) + self.value = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION]) + self.dropout_attention = 
nn.Dropout(config[varables.RATE_DROPOUT]) + self.dropout_residue = nn.Dropout(config[varables.RATE_DROPOUT]) + self.projection = nn.Linear(config[varables.DIM_ATTENTION], config[varables.DIM_EMBEDDING]) + self.register_buffer("mask", torch.tril(torch.ones(config[varables.SIZE_BLOCK], config[varables.SIZE_BLOCK])) + .view(1, 1, config[varables.SIZE_BLOCK], config[varables.SIZE_BLOCK])) + self.n_head = config[varables.NUM_HEADS] + self.single_head_dim = config[varables.DIM_ATTENTION] // self.n_head + self.attention_features = config[varables.DIM_ATTENTION] + + def forward(self, x, layer_past=None): + B, T, C = x.size() + k = self.key(x).view(B, T, self.n_head,self.single_head_dim).transpose(1, 2) + q = self.query(x).view(B, T, self.n_head,self.single_head_dim).transpose(1, 2) + v = self.value(x).view(B, T, self.n_head,self.single_head_dim).transpose(1, 2) + att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1))) + att = att.masked_fill(self.mask[:,:,:T,:T] == 0, float('-inf')) + att = F.softmax(att, dim=-1) + att = self.dropout_attention(att) + y = att @ v + y = y.transpose(1, 2).contiguous().view(B, T, self.attention_features) + y = self.dropout_residue(self.projection(y)) + return y + + +class CrossAttention(nn.Module): + def __init__(self, config): + super().__init__() + assert config[varables.DIM_ATTENTION] % config[varables.NUM_HEADS] == 0 + self.key = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION]) + self.query = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION]) + self.value = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION]) + self.dropout_attention = nn.Dropout(config[varables.RATE_DROPOUT]) + self.dropout_residue = nn.Dropout(config[varables.RATE_DROPOUT]) + self.projection = nn.Linear(config[varables.DIM_ATTENTION], config[varables.DIM_EMBEDDING]) + self.n_head = config[varables.NUM_HEADS] + self.single_head_dim = config[varables.DIM_ATTENTION] // self.n_head + 
self.attention_features = config[varables.DIM_ATTENTION] + self.register_buffer("mask", torch.tril(torch.ones(config[varables.SIZE_BLOCK], config[varables.SIZE_BLOCK])) + .view(1, 1, config[varables.SIZE_BLOCK], config[varables.SIZE_BLOCK])) + + def forward(self, x_encoder,x_decoder, layer_past=None): + B_encoder, T_encoder, C_encoder = x_encoder.size() + B_decoder, T_decoder, C_decoder = x_decoder.size() + k = self.key( x_encoder).view(B_encoder, T_encoder, self.n_head,self.single_head_dim).transpose(1, 2) + q = self.query(x_decoder).view(B_encoder, T_decoder, self.n_head,self.single_head_dim).transpose(1, 2) + v = self.value(x_encoder).view(B_encoder, T_encoder, self.n_head,self.single_head_dim).transpose(1, 2) + att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1))) + att = att.masked_fill(self.mask[:,:,:T_decoder,:T_encoder] == 0, float('-inf')) + att = F.softmax(att, dim=-1) + att = self.dropout_attention(att) + y = att @ v + y = y.transpose(1, 2).contiguous().view(B_encoder, T_decoder, self.attention_features) + y = self.dropout_residue(self.projection(y)) + return y + + + + +class EncoderBlock(nn.Module): + def __init__(self, config): + super().__init__() + self.ln1 = nn.LayerNorm(config[varables.DIM_EMBEDDING]) + self.ln2 = nn.LayerNorm(config[varables.DIM_EMBEDDING]) + self.attn = CausalSelfAttention(config) + self.mlp = nn.Sequential( + nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_FEEDFORWARD]), + nn.GELU(), + nn.Linear(config[varables.DIM_FEEDFORWARD], config[varables.DIM_EMBEDDING]), + nn.Dropout(config[varables.RATE_DROPOUT]), + ) + + def forward(self, x): + # = y_input + x = x + self.attn(self.ln1(x)) + x = x + self.mlp(self.ln2(x)) + return x + +class DecoderBlock(nn.Module): + def __init__(self, config): + super().__init__() + self.ln1 = nn.LayerNorm(config[varables.DIM_EMBEDDING]) + self.ln2 = nn.LayerNorm(config[varables.DIM_EMBEDDING]) + self.masked_attn = CausalSelfAttention(config) + self.cross_attn = 
# ============================================================================
# Reconstructed from diff-mangled source: classic encoder/decoder Transformer
# building blocks (Annotated-Transformer style) used by the SCMG models.
# The preceding partial DecoderBlock body was cut off before this chunk and is
# not reproduced here.
# ============================================================================

import math
import copy

import torch
import torch.nn as nn
import torch.nn.functional as F


class Norm(nn.Module):
    """Layer normalisation with a learnable gain (`alpha`) and bias.

    NOTE(review): ``Tensor.std`` uses the *sample* std (ddof=1), so this is
    not numerically identical to ``nn.LayerNorm``; preserved as written.
    """

    def __init__(self, d_model, eps=1e-6):
        super().__init__()
        self.size = d_model
        # Two learnable parameters to calibrate the normalisation.
        self.alpha = nn.Parameter(torch.ones(self.size))
        self.bias = nn.Parameter(torch.zeros(self.size))
        self.eps = eps

    def forward(self, x):
        mean = x.mean(dim=-1, keepdim=True)
        std = x.std(dim=-1, keepdim=True)
        return self.alpha * (x - mean) / (std + self.eps) + self.bias


def attention(q, k, v, d_k, mask=None, dropout=None):
    """Scaled dot-product attention.

    q, k, v: (batch, heads, seq, d_k) tensors.  Positions where ``mask == 0``
    are filled with -1e9 before the softmax; ``dropout`` (a module) is applied
    to the attention weights when given.
    """
    scores = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(d_k)

    if mask is not None:
        # Broadcast the mask over the head dimension.
        mask = mask.unsqueeze(1)
        scores = scores.masked_fill(mask == 0, -1e9)

    scores = F.softmax(scores, dim=-1)

    if dropout is not None:
        scores = dropout(scores)

    return torch.matmul(scores, v)


class MultiHeadAttention(nn.Module):
    """Multi-head attention: project q/k/v, attend per head, re-project."""

    def __init__(self, heads, d_model, dropout=0.1):
        super().__init__()
        self.d_model = d_model
        self.d_k = d_model // heads
        self.h = heads

        self.q_linear = nn.Linear(d_model, d_model)
        self.v_linear = nn.Linear(d_model, d_model)
        self.k_linear = nn.Linear(d_model, d_model)

        self.dropout = nn.Dropout(dropout)
        self.out = nn.Linear(d_model, d_model)

    def forward(self, q, k, v, mask=None):
        bs = q.size(0)

        # Linear projections, then split the feature dim into `h` heads.
        k = self.k_linear(k).view(bs, -1, self.h, self.d_k)
        q = self.q_linear(q).view(bs, -1, self.h, self.d_k)
        v = self.v_linear(v).view(bs, -1, self.h, self.d_k)

        # -> (bs, heads, seq, d_k)
        k = k.transpose(1, 2)
        q = q.transpose(1, 2)
        v = v.transpose(1, 2)

        scores = attention(q, k, v, self.d_k, mask, self.dropout)

        # Concatenate heads and apply the final linear layer.
        concat = scores.transpose(1, 2).contiguous().view(bs, -1, self.d_model)
        return self.out(concat)


class FeedForward(nn.Module):
    """Position-wise feed-forward block: Linear -> ReLU -> dropout -> Linear."""

    def __init__(self, d_model, d_ff=2048, dropout=0.1):
        super().__init__()
        self.linear_1 = nn.Linear(d_model, d_ff)
        self.dropout = nn.Dropout(dropout)
        self.linear_2 = nn.Linear(d_ff, d_model)

    def forward(self, x):
        x = self.dropout(F.relu(self.linear_1(x)))
        return self.linear_2(x)


class EncoderLayer(nn.Module):
    """Pre-norm encoder layer: self-attention + feed-forward, both residual."""

    def __init__(self, d_model, heads, dropout=0.1):
        super().__init__()
        self.norm_1 = Norm(d_model)
        self.norm_2 = Norm(d_model)
        self.attn = MultiHeadAttention(heads, d_model, dropout=dropout)
        self.ff = FeedForward(d_model, dropout=dropout)
        self.dropout_1 = nn.Dropout(dropout)
        self.dropout_2 = nn.Dropout(dropout)

    def forward(self, x, mask):
        x2 = self.norm_1(x)
        x = x + self.dropout_1(self.attn(x2, x2, x2, mask))
        x2 = self.norm_2(x)
        x = x + self.dropout_2(self.ff(x2))
        return x


class DecoderLayer(nn.Module):
    """Pre-norm decoder layer: masked self-attn, cross-attn, feed-forward."""

    def __init__(self, d_model, heads, dropout=0.1):
        super().__init__()
        self.norm_1 = Norm(d_model)
        self.norm_2 = Norm(d_model)
        self.norm_3 = Norm(d_model)

        self.dropout_1 = nn.Dropout(dropout)
        self.dropout_2 = nn.Dropout(dropout)
        self.dropout_3 = nn.Dropout(dropout)

        self.attn_1 = MultiHeadAttention(heads, d_model, dropout=dropout)
        self.attn_2 = MultiHeadAttention(heads, d_model, dropout=dropout)
        self.ff = FeedForward(d_model, dropout=dropout)

    def forward(self, x, e_outputs, src_mask, trg_mask):
        x2 = self.norm_1(x)
        x = x + self.dropout_1(self.attn_1(x2, x2, x2, trg_mask))
        x2 = self.norm_2(x)
        x = x + self.dropout_2(self.attn_2(x2, e_outputs, e_outputs, src_mask))
        x2 = self.norm_3(x)
        x = x + self.dropout_3(self.ff(x2))
        return x


class Embedder(nn.Module):
    """Thin wrapper around nn.Embedding."""

    def __init__(self, vocab_size, d_model):
        super().__init__()
        self.d_model = d_model
        self.embed = nn.Embedding(vocab_size, d_model)

    def forward(self, x):
        return self.embed(x)


class PositionalEncoder(nn.Module):
    """Adds a fixed sinusoidal positional encoding to scaled embeddings.

    NOTE(review): the cos entries use exponent ``2*(i+1)/d_model`` while the
    sin entries use ``2*i/d_model`` — this deviates from the canonical
    formulation where each sin/cos pair shares one frequency.  Preserved
    as written so that any trained checkpoints keep working.
    """

    def __init__(self, d_model, max_seq_len=200, dropout=0.1):
        super().__init__()
        self.d_model = d_model
        self.dropout = nn.Dropout(dropout)
        # Constant 'pe' matrix whose values depend on position and channel.
        pe = torch.zeros(max_seq_len, d_model)
        for pos in range(max_seq_len):
            for i in range(0, d_model, 2):
                pe[pos, i] = math.sin(pos / (10000 ** ((2 * i) / d_model)))
                pe[pos, i + 1] = math.cos(pos / (10000 ** ((2 * (i + 1)) / d_model)))
        pe = pe.unsqueeze(0)
        self.register_buffer('pe', pe)

    def forward(self, x):
        # Make the embeddings relatively larger than the additive encoding.
        x = x * math.sqrt(self.d_model)
        seq_len = x.size(1)
        # FIX: the original did `if x.is_cuda: pe.cuda()` and discarded the
        # result — a no-op that still crashed on device mismatch.  Buffers are
        # constant, so simply move the slice to the input's device.
        pe = self.pe[:, :seq_len].to(x.device)
        x = x + pe
        return self.dropout(x)


def get_clones(module, N):
    """Return a ModuleList of N independent deep copies of `module`."""
    return nn.ModuleList([copy.deepcopy(module) for _ in range(N)])


class Encoder(nn.Module):
    """Embedding + positional encoding + N encoder layers + final norm."""

    def __init__(self, vocab_size, d_model, N, heads, dropout):
        super().__init__()
        self.N = N
        self.embed = Embedder(vocab_size, d_model)
        self.pe = PositionalEncoder(d_model, dropout=dropout)
        self.layers = get_clones(EncoderLayer(d_model, heads, dropout), N)
        self.norm = Norm(d_model)

    def forward(self, src, mask):
        x = self.pe(self.embed(src))
        for layer in self.layers:
            x = layer(x, mask)
        return self.norm(x)


class Decoder(nn.Module):
    """Embedding + positional encoding + N decoder layers + final norm."""

    def __init__(self, vocab_size, d_model, N, heads, dropout):
        super().__init__()
        self.N = N
        self.embed = Embedder(vocab_size, d_model)
        self.pe = PositionalEncoder(d_model, dropout=dropout)
        self.layers = get_clones(DecoderLayer(d_model, heads, dropout), N)
        self.norm = Norm(d_model)

    def forward(self, trg, e_outputs, src_mask, trg_mask):
        x = self.pe(self.embed(trg))
        for layer in self.layers:
            x = layer(x, e_outputs, src_mask, trg_mask)
        return self.norm(x)


class Model(nn.Module):
    """Full seq2seq Transformer: Encoder + Decoder + output projection.

    `config` is a dict keyed by the SCMG config-constant strings plus
    "vocab_encoder"/"vocab_decoder" token->id mappings.
    """

    def __init__(self, config):
        super().__init__()
        # NOTE(review): DIM_ATTENTION / NUM_LAYERS / NUM_HEADS / RATE_DROPOUT
        # appear to be declared in SCMG.config.modelparameters, not in
        # varables — confirm `varables` re-exports them, otherwise these
        # lookups raise AttributeError at construction time.
        self.encoder = Encoder(len(config["vocab_encoder"]), config[varables.DIM_ATTENTION],
                               config[varables.NUM_LAYERS], config[varables.NUM_HEADS],
                               config[varables.RATE_DROPOUT])
        self.decoder = Decoder(len(config["vocab_decoder"]), config[varables.DIM_ATTENTION],
                               config[varables.NUM_LAYERS], config[varables.NUM_HEADS],
                               config[varables.RATE_DROPOUT])
        self.out = nn.Linear(config[varables.DIM_ATTENTION], len(config["vocab_decoder"]))
        # Set externally (or via init_optimizers) before init_scheduler is used.
        self.optimizer = None

    def get_block_size(self):
        # NOTE(review): self.block_size is never assigned in __init__ (the
        # assignment was commented out upstream), so this raises
        # AttributeError if called — confirm intended source of block size.
        return self.block_size

    def _init_weights(self, module):
        """GPT-style init; only effective if wired up via self.apply()."""
        if isinstance(module, (nn.Linear, nn.Embedding)):
            module.weight.data.normal_(mean=0.0, std=0.02)
            if isinstance(module, nn.Linear) and module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

    def init_optimizers(self, train_config):
        """Build an Adam optimizer from the training config (not stored)."""
        return torch.optim.Adam(self.parameters(), lr=train_config[varables.RATE_LEARNING])

    def init_scheduler(self, train_config):
        """Build a StepLR scheduler around self.optimizer (must be set first)."""
        return torch.optim.lr_scheduler.StepLR(self.optimizer,
                                               step_size=train_config[varables.SIZE_STEP],
                                               gamma=train_config[varables.GAMMA])

    def get_collate_fn(self, vocab_encoder, vocab_decoder):
        """Return a DataLoader collate fn that right-pads encoder/decoder ids."""
        def collate(results):
            x_in = [a[0] for a in results]
            y_in = [a[1] for a in results]
            boundary = -1  # unused placeholder kept for interface parity
            max_len_x = max(len(a) for a in x_in)
            max_len_y = max(len(a) for a in y_in)
            x = torch.tensor([a + [vocab_encoder[varables.TOKEN_PAD]] * (max_len_x - len(a)) for a in x_in],
                             dtype=torch.long)
            y = torch.tensor([a + [vocab_decoder[varables.TOKEN_PAD]] * (max_len_y - len(a)) for a in y_in],
                             dtype=torch.long)
            return x, y, boundary
        return collate

    def forward(self, src, trg, trg_out, boundary=None):
        """Return (logits, loss); loss is None when trg_out is None.

        NOTE(review): src_mask is always None (no padding mask on the
        encoder side) and cross_entropy has no ignore_index, so pad tokens
        contribute to the loss — confirm both are intended.
        """
        src_mask = None
        trg_mask = torch.tril(torch.ones(trg.shape[1], trg.shape[1])) \
            .view(1, 1, trg.shape[1], trg.shape[1]).to(trg.device)
        e_outputs = self.encoder(src, src_mask)
        d_output = self.decoder(trg, e_outputs, src_mask, trg_mask)
        logits = self.out(d_output)
        loss = None
        if trg_out is not None:
            loss = F.cross_entropy(logits.view(-1, logits.size(-1)), trg_out.view(-1))
        return logits, loss
# ============================================================================
# Reconstructed from diff-mangled source.  This span of the dump contains the
# bodies of several new files concatenated together; the section markers
# below preserve provenance.  Because the files are concatenated, later
# definitions (Model, DecoderBlock, sample) shadow earlier ones of the same
# name in this single-module reconstruction.
# ============================================================================

# --- SCMG/models/UTFMG2/model copy.py ---------------------------------------
import math
import logging

import torch
import torch.nn as nn
from torch.nn import functional as F

logger = logging.getLogger(__name__)
# NOTE(review): DIM_ATTENTION / NUM_HEADS / SIZE_BLOCK / etc. look like they
# are declared in SCMG.config.modelparameters — confirm `varables` exposes
# them, otherwise every config[varables.X] lookup raises AttributeError.
from SCMG.config import varables


class CausalSelfAttention(nn.Module):
    """Multi-head self-attention with a fixed lower-triangular (causal) mask."""

    def __init__(self, config):
        super().__init__()
        assert config[varables.DIM_ATTENTION] % config[varables.NUM_HEADS] == 0
        self.key = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION])
        self.query = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION])
        self.value = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION])
        self.dropout_attention = nn.Dropout(config[varables.RATE_DROPOUT])
        self.dropout_residue = nn.Dropout(config[varables.RATE_DROPOUT])
        self.projection = nn.Linear(config[varables.DIM_ATTENTION], config[varables.DIM_EMBEDDING])
        # (1, 1, block, block) lower-triangular mask buffer.
        self.register_buffer(
            "mask",
            torch.tril(torch.ones(config[varables.SIZE_BLOCK], config[varables.SIZE_BLOCK]))
            .view(1, 1, config[varables.SIZE_BLOCK], config[varables.SIZE_BLOCK]))
        self.n_head = config[varables.NUM_HEADS]
        self.single_head_dim = config[varables.DIM_ATTENTION] // self.n_head
        self.attention_features = config[varables.DIM_ATTENTION]

    def forward(self, x, layer_past=None):
        B, T, C = x.size()
        k = self.key(x).view(B, T, self.n_head, self.single_head_dim).transpose(1, 2)
        q = self.query(x).view(B, T, self.n_head, self.single_head_dim).transpose(1, 2)
        v = self.value(x).view(B, T, self.n_head, self.single_head_dim).transpose(1, 2)
        att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1)))
        att = att.masked_fill(self.mask[:, :, :T, :T] == 0, float('-inf'))
        att = F.softmax(att, dim=-1)
        att = self.dropout_attention(att)
        y = att @ v
        y = y.transpose(1, 2).contiguous().view(B, T, self.attention_features)
        return self.dropout_residue(self.projection(y))


class CrossAttention(nn.Module):
    """Attention where queries come from the decoder and keys/values from
    the encoder.

    NOTE(review): a lower-triangular mask is applied to the *cross* scores
    (decoder position t may only see the first t encoder positions).  That is
    unusual for standard cross-attention but may be deliberate for the
    single-stream UTFMG scheme — preserved as written; confirm.
    """

    def __init__(self, config):
        super().__init__()
        assert config[varables.DIM_ATTENTION] % config[varables.NUM_HEADS] == 0
        self.key = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION])
        self.query = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION])
        self.value = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION])
        self.dropout_attention = nn.Dropout(config[varables.RATE_DROPOUT])
        self.dropout_residue = nn.Dropout(config[varables.RATE_DROPOUT])
        self.projection = nn.Linear(config[varables.DIM_ATTENTION], config[varables.DIM_EMBEDDING])
        self.n_head = config[varables.NUM_HEADS]
        self.single_head_dim = config[varables.DIM_ATTENTION] // self.n_head
        self.attention_features = config[varables.DIM_ATTENTION]
        self.register_buffer(
            "mask",
            torch.tril(torch.ones(config[varables.SIZE_BLOCK], config[varables.SIZE_BLOCK]))
            .view(1, 1, config[varables.SIZE_BLOCK], config[varables.SIZE_BLOCK]))

    def forward(self, x_encoder, x_decoder, layer_past=None):
        B, T_encoder, _ = x_encoder.size()
        _, T_decoder, _ = x_decoder.size()  # batch sizes assumed equal
        k = self.key(x_encoder).view(B, T_encoder, self.n_head, self.single_head_dim).transpose(1, 2)
        q = self.query(x_decoder).view(B, T_decoder, self.n_head, self.single_head_dim).transpose(1, 2)
        v = self.value(x_encoder).view(B, T_encoder, self.n_head, self.single_head_dim).transpose(1, 2)
        att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1)))
        att = att.masked_fill(self.mask[:, :, :T_decoder, :T_encoder] == 0, float('-inf'))
        att = F.softmax(att, dim=-1)
        att = self.dropout_attention(att)
        y = att @ v
        y = y.transpose(1, 2).contiguous().view(B, T_decoder, self.attention_features)
        return self.dropout_residue(self.projection(y))


class EncoderBlock(nn.Module):
    """Pre-norm block: self-attention + MLP with residuals.

    NOTE(review): uses CausalSelfAttention, so even the *encoder* is causally
    masked — confirm this is intended for UTFMG.
    """

    def __init__(self, config):
        super().__init__()
        self.ln1 = nn.LayerNorm(config[varables.DIM_EMBEDDING])
        self.ln2 = nn.LayerNorm(config[varables.DIM_EMBEDDING])
        self.attn = CausalSelfAttention(config)
        self.mlp = nn.Sequential(
            nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_FEEDFORWARD]),
            nn.GELU(),
            nn.Linear(config[varables.DIM_FEEDFORWARD], config[varables.DIM_EMBEDDING]),
            nn.Dropout(config[varables.RATE_DROPOUT]),
        )

    def forward(self, x):
        x = x + self.attn(self.ln1(x))
        x = x + self.mlp(self.ln2(x))
        return x


class DecoderBlock(nn.Module):
    """Pre-norm decoder block: masked self-attn, cross-attn, MLP.

    NOTE(review): self.ln1 is reused for both the masked and the cross
    attention sub-layers (no separate norm) — preserved as written.
    """

    def __init__(self, config):
        super().__init__()
        self.ln1 = nn.LayerNorm(config[varables.DIM_EMBEDDING])
        self.ln2 = nn.LayerNorm(config[varables.DIM_EMBEDDING])
        self.masked_attn = CausalSelfAttention(config)
        self.cross_attn = CrossAttention(config)
        self.mlp = nn.Sequential(
            nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_FEEDFORWARD]),
            nn.GELU(),
            nn.Linear(config[varables.DIM_FEEDFORWARD], config[varables.DIM_EMBEDDING]),
            nn.Dropout(config[varables.RATE_DROPOUT]),
        )

    def forward(self, x_encoder, x):
        x = x + self.masked_attn(self.ln1(x))
        x = x + self.cross_attn(x_encoder, self.ln1(x))
        x = x + self.mlp(self.ln2(x))
        return x


class Model(nn.Module):
    """GPT-style encoder/decoder stack with shared token embedding."""

    def __init__(self, config):
        super().__init__()
        self.tok_emb = nn.Embedding(config[varables.SIZE_VOCAB], config[varables.DIM_EMBEDDING])
        # Learned positional embedding (zeros-initialised).
        self.pos_emb = nn.Parameter(torch.zeros(1, config[varables.SIZE_BLOCK], config[varables.DIM_EMBEDDING]))
        self.drop = nn.Dropout(config[varables.RATE_DROPOUT])
        self.encoder_blocks = nn.ModuleList([EncoderBlock(config) for _ in range(config[varables.NUM_LAYERS])])
        self.decoder_blocks = nn.ModuleList([DecoderBlock(config) for _ in range(config[varables.NUM_LAYERS])])
        self.ln_f = nn.LayerNorm(config[varables.DIM_EMBEDDING])
        self.head = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.SIZE_VOCAB], bias=False)
        self.block_size = config[varables.SIZE_BLOCK]
        self.apply(self._init_weights)
        logger.info("number of parameters: %e", sum(p.numel() for p in self.parameters()))
        self.optimizer = None

    def get_block_size(self):
        return self.block_size

    def _init_weights(self, module):
        """GPT-2 style init, applied to every submodule via self.apply()."""
        if isinstance(module, (nn.Linear, nn.Embedding)):
            module.weight.data.normal_(mean=0.0, std=0.02)
            if isinstance(module, nn.Linear) and module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

    def init_optimizers(self, train_config):
        """Build an Adam optimizer (caller stores it on self.optimizer)."""
        return torch.optim.Adam(self.parameters(), lr=train_config[varables.RATE_LEARNING])

    def init_scheduler(self, train_config):
        """Build a StepLR scheduler; requires self.optimizer to be set."""
        return torch.optim.lr_scheduler.StepLR(self.optimizer,
                                               step_size=train_config[varables.SIZE_STEP],
                                               gamma=train_config[varables.GAMMA])

    def get_collate_fn(self, vocab):
        """Return a collate fn that right-pads x/y id sequences with TOKEN_PAD."""
        def collate(results):
            x_in = [a[0] for a in results]
            y_in = [a[1] for a in results]
            boundary = -1  # placeholder kept for interface parity
            max_len_x = max(len(a) for a in x_in)
            max_len_y = max(len(a) for a in y_in)
            x = torch.tensor([a + [vocab[varables.TOKEN_PAD]] * (max_len_x - len(a)) for a in x_in],
                             dtype=torch.long)
            y = torch.tensor([a + [vocab[varables.TOKEN_PAD]] * (max_len_y - len(a)) for a in y_in],
                             dtype=torch.long)
            return x, y, boundary
        return collate

    def forward(self, x_in, y_in, y_out=None, boundary=None):
        """Encode x_in, decode y_in against it; return (logits, loss)."""
        x_in = self.drop(self.tok_emb(x_in) + self.pos_emb[:, :x_in.size()[1], :])
        y_in = self.drop(self.tok_emb(y_in) + self.pos_emb[:, :y_in.size()[1], :])
        for encoder_block in self.encoder_blocks:
            x_in = encoder_block(x_in)
        x_in = self.ln_f(x_in)
        for decoder_block in self.decoder_blocks:
            y_in = decoder_block(x_in, y_in)
        y_in = self.ln_f(y_in)
        logits = self.head(y_in)
        loss = None
        if y_out is not None:
            # NOTE(review): no ignore_index, so pad tokens count in the loss.
            loss = F.cross_entropy(logits.view(-1, logits.size(-1)), y_out.view(-1))
        return logits, loss


# --- SCMG/models/UTFMG2/model.py ---------------------------------------------
from torch.autograd import Variable  # imported upstream; unused here


class PositionalEncoder(nn.Module):
    """Fixed sinusoidal positional-encoding table (vectorised form)."""

    def __init__(self, config):
        super(PositionalEncoder, self).__init__()
        self.Dropout = nn.Dropout(p=config[varables.RATE_DROPOUT])
        max_len = config[varables.SIZE_BLOCK]
        pe = torch.zeros(max_len, config[varables.DIM_ATTENTION])
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, config[varables.DIM_ATTENTION], 2).float()
                             * (-math.log(10000.0) / config[varables.DIM_ATTENTION]))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0)  # (1, max_len, dim) to broadcast over batch
        self.register_buffer('pe', pe)

    def forward(self, T):
        # NOTE(review): dropout is applied to the encoding table itself,
        # not to (embedding + encoding) — confirm intended.
        return self.Dropout(self.pe[:, :T, :])


class Attention(nn.Module):
    """Unified self/cross multi-head attention: K/V from X_1, Q from X_2."""

    def __init__(self, config):
        super().__init__()
        assert config[varables.DIM_ATTENTION] % config[varables.NUM_HEADS] == 0
        self.Key = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION])
        self.Query = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION])
        self.Value = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION])
        self.Dropout_Attention = nn.Dropout(config[varables.RATE_DROPOUT])
        self.Dropout_Residue = nn.Dropout(config[varables.RATE_DROPOUT])
        self.Projection = nn.Linear(config[varables.DIM_ATTENTION], config[varables.DIM_EMBEDDING])
        self.NumberOfHeads = config[varables.NUM_HEADS]
        self.DimHead = config[varables.DIM_ATTENTION] // self.NumberOfHeads
        self.DimAttention = config[varables.DIM_ATTENTION]

    def forward(self, X_1, X_2, mask=None):
        # Self-attention when X_2 is None.
        if X_2 is None:
            X_2 = X_1
        BatchSize, T_Encoder, _ = X_1.size()
        BatchSize, T_Decoder, _ = X_2.size()
        K = self.Key(X_1).view(BatchSize, T_Encoder, self.NumberOfHeads, self.DimHead).transpose(1, 2)
        Q = self.Query(X_2).view(BatchSize, T_Decoder, self.NumberOfHeads, self.DimHead).transpose(1, 2)
        V = self.Value(X_1).view(BatchSize, T_Encoder, self.NumberOfHeads, self.DimHead).transpose(1, 2)
        ScoreAttention = (Q @ K.transpose(-2, -1)) / math.sqrt(self.DimHead)
        # FIX: the original unconditionally evaluated `mask == 0`, which
        # raises TypeError when mask is None (the declared default).
        if mask is not None:
            ScoreAttention = ScoreAttention.masked_fill(mask == 0, -1e9)
        ScoreAttention = F.softmax(ScoreAttention, dim=-1)
        ScoreAttention = self.Dropout_Attention(ScoreAttention)
        Z = ScoreAttention @ V
        Z = Z.transpose(1, 2).contiguous().view(BatchSize, T_Decoder, self.DimAttention)
        return self.Dropout_Residue(self.Projection(Z))


class FeedForward(nn.Module):
    """MLP sub-layer; DIM_FEEDFORWARD == 0 means "use 4 * DIM_ATTENTION"."""

    def __init__(self, config):
        super().__init__()
        if config[varables.DIM_FEEDFORWARD] == 0:
            Dim_FeedForward = config[varables.DIM_ATTENTION] * 4
        else:
            Dim_FeedForward = config[varables.DIM_FEEDFORWARD]
        self.Linear1 = nn.Linear(config[varables.DIM_EMBEDDING], Dim_FeedForward)
        self.GELU = nn.GELU()
        self.Linear2 = nn.Linear(Dim_FeedForward, config[varables.DIM_EMBEDDING])
        self.Dropout = nn.Dropout(config[varables.RATE_DROPOUT])

    def forward(self, x):
        x = self.Linear1(x)
        x = self.GELU(x)
        x = self.Dropout(x)
        return self.Linear2(x)


class DecoderBlock(nn.Module):
    """Pre-norm decoder block (shadows the 'model copy' DecoderBlock above).

    NOTE(review): dropout is applied *around* the residual sum
    (Dropout(x + sublayer(x))), which also drops the skip path — unusual;
    preserved as written.
    """

    def __init__(self, config):
        super().__init__()
        self.LayerNorm1 = nn.LayerNorm(config[varables.DIM_EMBEDDING])
        self.LayerNorm2 = nn.LayerNorm(config[varables.DIM_EMBEDDING])
        self.LayerNorm3 = nn.LayerNorm(config[varables.DIM_EMBEDDING])
        self.Dropout1 = nn.Dropout(config[varables.RATE_DROPOUT])
        self.Dropout2 = nn.Dropout(config[varables.RATE_DROPOUT])
        self.Dropout3 = nn.Dropout(config[varables.RATE_DROPOUT])
        self.AttentionMasked = Attention(config)
        self.AttentionCross = Attention(config)
        self.FeedForward = FeedForward(config)

    def forward(self, X_Encoder, X_Decoder, Mask_Cross, Mask_Decoder):
        X_Decoder = self.Dropout1(X_Decoder + self.AttentionMasked(self.LayerNorm1(X_Decoder), None, Mask_Decoder))
        X_Decoder = self.Dropout2(X_Decoder + self.AttentionCross(X_Encoder, self.LayerNorm2(X_Decoder), Mask_Cross))
        X_Decoder = self.Dropout3(X_Decoder + self.FeedForward(self.LayerNorm3(X_Decoder)))
        return X_Decoder


class Model(nn.Module):
    """UTFMG2 encoder/decoder model (shadows the 'model copy' Model above).

    NOTE(review): this class is clearly work-in-progress — see the notes in
    generate_masks() and forward().  Behaviour is preserved as written.
    """

    def __init__(self, config):
        super().__init__()
        self.Dim_Attention = config[varables.DIM_ATTENTION]
        self.Token_Padding_Encoder = config["Token_Padding_Encoder"]
        self.Token_Padding_Decoder = config["Token_Padding_Decoder"]
        # Embedding and positional encoding layers.
        self.Embedding_Encoder = nn.Embedding(len(config["vocab_encoder"]), config[varables.DIM_ATTENTION])
        self.Embedding_Decoder = nn.Embedding(len(config["vocab_decoder"]), config[varables.DIM_ATTENTION])
        self.pos_emb = PositionalEncoder(config)
        # Dropout and normalisation layers.
        self.Dropout1 = nn.Dropout(config[varables.RATE_DROPOUT])
        self.Dropout2 = nn.Dropout(config[varables.RATE_DROPOUT])
        self.LayerNorm1 = nn.LayerNorm(config[varables.DIM_EMBEDDING])
        self.LayerNorm2 = nn.LayerNorm(config[varables.DIM_EMBEDDING])
        # Transformer layers.  NOTE(review): EncoderBlock is defined in
        # "model copy.py", not in this file — in the real package this is a
        # NameError unless it is imported; it only resolves here because the
        # files are concatenated in this reconstruction.
        self.encoder_blocks = nn.ModuleList([EncoderBlock(config) for _ in range(config[varables.NUM_LAYERS])])
        self.decoder_blocks = nn.ModuleList([DecoderBlock(config) for _ in range(config[varables.NUM_LAYERS])])
        # Output layer.
        self.head = nn.Linear(config[varables.DIM_ATTENTION], len(config["vocab_decoder"]), bias=False)
        self.apply(self._init_weights)
        self.optimizer = None

    def _init_weights(self, module):
        # NOTE(review): Module.parameters() is recursive, and self.apply()
        # visits every submodule, so nested parameters are re-initialised
        # once per enclosing module — harmless (same init) but wasteful.
        for p in module.parameters():
            if p.dim() > 1:
                nn.init.xavier_uniform_(p)

    def init_optimizers(self, train_config):
        """Build an Adam optimizer (caller stores it on self.optimizer)."""
        return torch.optim.Adam(self.parameters(), lr=train_config[varables.RATE_LEARNING])

    def init_scheduler(self, train_config):
        """Build a StepLR scheduler; requires self.optimizer to be set."""
        return torch.optim.lr_scheduler.StepLR(self.optimizer,
                                               step_size=train_config[varables.SIZE_STEP],
                                               gamma=train_config[varables.GAMMA])

    def get_collate_fn(self, vocab_encoder, vocab_decoder):
        """Return a collate fn that right-pads encoder/decoder id sequences."""
        def collate(results):
            X_Encoder = [a[0] for a in results]
            X_Decoder = [a[1] for a in results]
            boundary = -1
            max_len_x = max(len(a) for a in X_Encoder)
            max_len_y = max(len(a) for a in X_Decoder)
            x = torch.tensor([a + [vocab_encoder[varables.TOKEN_PAD]] * (max_len_x - len(a)) for a in X_Encoder],
                             dtype=torch.long)
            y = torch.tensor([a + [vocab_decoder[varables.TOKEN_PAD]] * (max_len_y - len(a)) for a in X_Decoder],
                             dtype=torch.long)
            return x, y, boundary
        return collate

    def generate_masks(self, X_Encoder, X_Decoder):
        """Build encoder / decoder / cross attention masks.

        NOTE(review): work-in-progress, preserved as written:
          * three dims are unpacked from inputs that forward() passes as 2-D
            (batch, seq) token-id tensors;
          * self.Token_Sep_Encoder is never assigned in __init__ (only the
            Token_Padding_* attributes are);
          * T is used but never defined (its assignment is commented out);
          * forward() unpacks the return as (Mask_Decoder, Mask_UTFMG,
            CutIndex), which does not match the values returned here.
        """
        BatchSize, T_Encoder, _ = X_Encoder.size()
        BatchSize, T_Decoder, _ = X_Decoder.size()
        X = torch.cat([X_Encoder,
                       torch.tensor([self.Token_Sep_Encoder], device=X_Encoder.device).unsqueeze(0).repeat(BatchSize, 1),
                       X_Decoder], axis=1)
        CutIndex = T_Encoder + 1
        # T = X_Decoder.shape[1]
        Mask_Encoder = (X_Encoder != self.Token_Padding_Encoder).unsqueeze(-2).unsqueeze(-2)
        Mask_Decoder = (X_Decoder != self.Token_Padding_Decoder).unsqueeze(-2).unsqueeze(-2).repeat(1, 1, T, 1)
        Mask_Cross = (X_Encoder != self.Token_Padding_Encoder).unsqueeze(-2).unsqueeze(-2)
        mask_tril = torch.tril(torch.ones(T, T)).view(1, 1, T, T).to(Mask_Decoder.device)
        Mask_Decoder = Mask_Decoder.masked_fill(mask_tril == 0, 0)
        return Mask_Encoder, Mask_Decoder, Mask_Cross

    def forward(self, X_Encoder, X_Decoder, Y_Decoder_Ref=None, boundary=None):
        """Return (logits, loss); loss is None when Y_Decoder_Ref is None.

        NOTE(review): WIP — X_Encoder is never embedded before being fed to
        the decoder blocks, and decoder_block is called with 3 arguments while
        DecoderBlock.forward takes 4.  Preserved as written.
        """
        Mask_Decoder, Mask_UTFMG, CutIndex = self.generate_masks(X_Encoder, X_Decoder)
        X_Decoder = self.Dropout2(self.Embedding_Decoder(X_Decoder) * math.sqrt(self.Dim_Attention)
                                  + self.pos_emb(X_Decoder.size(1)))
        for decoder_block in self.decoder_blocks:
            X_Decoder = decoder_block(X_Encoder, X_Decoder, Mask_UTFMG)
        X_Decoder = self.LayerNorm2(X_Decoder)
        Y_Decoder_Logits = self.head(X_Decoder[:, CutIndex:])
        loss = None
        if Y_Decoder_Ref is not None:
            loss = F.cross_entropy(Y_Decoder_Logits.view(-1, Y_Decoder_Logits.size(-1)),
                                   Y_Decoder_Ref.view(-1),
                                   ignore_index=self.Token_Padding_Decoder)
        return Y_Decoder_Logits, loss


# --- SCMG/models/UTFMG2/sampler.py -------------------------------------------
import random

import numpy as np


def set_seed(seed):
    """Seed python, numpy and torch (CPU + all CUDA devices)."""
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)


def top_k_logits(logits, k):
    """Return a copy of `logits` with everything below the k-th value -inf."""
    v, ix = torch.topk(logits, k)
    out = logits.clone()
    out[out < v[:, [-1]]] = -float('Inf')
    return out


# NOTE(review): this first `sample` is immediately shadowed by the second
# definition below and is therefore dead code — keep or rename deliberately.
@torch.no_grad()
def sample(model, x, steps, temperature=1.0, sample=False, top_k=None):
    """Autoregressive sampling with optional top-k / greedy decoding."""
    block_size = model.get_block_size()
    model.eval()
    for k in range(steps):
        x_cond = x if x.size(1) <= block_size else x[:, -block_size:]
        logits, _ = model(x_cond)
        logits = logits[:, -1, :] / temperature
        if top_k is not None:
            logits = top_k_logits(logits, top_k)
        probs = F.softmax(logits, dim=-1)
        if sample:
            ix = torch.multinomial(probs, num_samples=1)
        else:
            _, ix = torch.topk(probs, k=1, dim=-1)
        x = torch.cat((x, ix), dim=1)
    return x


@torch.no_grad()
def sample(model, x, steps, temperature=1.0, boundary=None):
    """Autoregressive multinomial sampling with a boundary-aware model call."""
    block_size = model.get_block_size()
    model.eval()
    for k in range(steps):
        x_cond = x if x.size(1) <= block_size else x[:, -block_size:]
        logits, _ = model(x_cond, boundary=boundary)
        logits = logits[:, -1, :] / temperature
        probs = F.softmax(logits, dim=-1)
        ix = torch.multinomial(probs, num_samples=1)
        x = torch.cat((x, ix), dim=1)
    return x


# No-op leftover expression from interactive experimentation (preserved).
'L_5*C(=O)NCc1cccc(OC)c1.*c1nsc2ccccc12COc1cccc(CNC(=O)c2cccc(NC(=O)c3nsc4ccccc34)c2)c1'


# for i in range(1,21):
def sample_L(i, option='string'):
    """Scratch driver for sampling from an 'L_<i>' scaffold prompt.

    NOTE(review): references module-level names that are not defined in this
    file (`vocab`, `model`, `inv`, `test_valid`) and hard-codes 'cuda' —
    clearly interactive scratch code; preserved as written.
    """
    prefix = 'L_' + str(i)
    string_input = prefix + '*O=C1NN=Cc2c1cccc2.*O=C(C1CC1)N1CCNCC1'
    array_input = [vocab[a] for a in [''] + list(string_input)]
    boundary = [len(array_input)]
    tensor_input = torch.tensor(array_input, device='cuda').unsqueeze(0).repeat(32, 1)
    boundary = boundary * 32
    tensor_output = sample(model, tensor_input, 250, boundary=boundary)
    strings_output = []
    for j in range(tensor_output.shape[0]):
        list_string_output = [inv[a] for a in tensor_output[j, boundary[j]:].cpu().numpy() if a != vocab['']]
        # if list_string_output[0] == '':
        #     list_string_output = list_string_output[1:]
        if list_string_output[-1] == '':
            list_string_output = list_string_output[:-1]
        string_output = ''.join(list_string_output)
        strings_output.append(string_output)
        print(string_output)
    for j in range(tensor_output.shape[0]):
        if test_valid(strings_output[j]):
            print(1)
        else:
            print(0)

    # logits,_ = model(tensor_input,boundary=boundary)


# No-op leftover token list from interactive experimentation (preserved).
['', 'L', '_', '5', '*', 'C', '(', '=', 'O', ')', 'N', 'C', 'c', '1', 'c', 'c', 'c', 'c', '(', 'O', 'C', ')', 'c', '1', '.', '*', 'c', '1', 'n', 's', 'c', '2', 'c', 'c', 'c', 'c', 'c', '1', '2', 'C', 'O', 'c', '1', 'c', 'c', 'c', 'c', '(', 'C', 'N', 'C', '(', '=', 'O', ')', 'c', '2', 'c', 'c', 'c', 'c', '(', 'N', 'C', '(', '=', 'O', ')', 'c', '3', 'n', 's', 'c', '4', 'c', 'c', 'c', 'c', 'c', '3', '4', ')', 'c', '2', ')', 'c', '1', '']


# --- SCMG/models/__init__.py (also present in this span of the dump) ---------
# from .Transformer import *