diff --git a/SCMG/__pycache__/_version.cpython-310.pyc b/SCMG/__pycache__/_version.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..dd53d1cdbb86b3f20d7da45154c1be8b81a09edc Binary files /dev/null and b/SCMG/__pycache__/_version.cpython-310.pyc differ diff --git a/SCMG/_version.py b/SCMG/_version.py new file mode 100644 index 0000000000000000000000000000000000000000..59a04da4d4c3180edab3d852e5d834c5169849da --- /dev/null +++ b/SCMG/_version.py @@ -0,0 +1,2 @@ +def get_versions(): + version = "0.1.1" diff --git a/SCMG/config/__init__.py b/SCMG/config/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/SCMG/config/__pycache__/__init__.cpython-310.pyc b/SCMG/config/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..599d93e1d892f0000e63bc183b8371cde3beb7bb Binary files /dev/null and b/SCMG/config/__pycache__/__init__.cpython-310.pyc differ diff --git a/SCMG/config/__pycache__/modelparameters.cpython-310.pyc b/SCMG/config/__pycache__/modelparameters.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..48a5fcd77f31050c7491ccecc6fc15f26d47b2e8 Binary files /dev/null and b/SCMG/config/__pycache__/modelparameters.cpython-310.pyc differ diff --git a/SCMG/config/__pycache__/varables.cpython-310.pyc b/SCMG/config/__pycache__/varables.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9924e92650a1390128fea2c726e4a0e5192a5689 Binary files /dev/null and b/SCMG/config/__pycache__/varables.cpython-310.pyc differ diff --git a/SCMG/config/modelparameters.py b/SCMG/config/modelparameters.py new file mode 100644 index 0000000000000000000000000000000000000000..1870a10a1615761eeb9d84bdad5dccfb2e2eac92 --- /dev/null +++ b/SCMG/config/modelparameters.py @@ -0,0 +1,21 @@ +# class ModelParameters(): +# def __init__(self): +# self.NUM_LAYERS = "num_layers" 
+# self.NUM_HEADS = "num_heads" +# self.DIM_ATTENTION = "dim_attention" +# self.DIM_FEEDFORWARD = "dim_feedforward" +# self.DIM_LSTM = "dim_lstm" +# self.DIM_EMBEDDING = "dim_embedding" +# self.DIM_OUTPUT = "dim_output" +# self.RATE_DROPOUT = "rate_dropout" +# return +# + +NUM_LAYERS = "num_layers" +NUM_HEADS = "num_heads" +DIM_ATTENTION = "dim_attention" +DIM_FEEDFORWARD = "dim_feedforward" +DIM_LSTM = "dim_lstm" +DIM_EMBEDDING = "dim_embedding" +DIM_OUTPUT = "dim_output" +RATE_DROPOUT = "rate_dropout" diff --git a/SCMG/config/varables.py b/SCMG/config/varables.py new file mode 100644 index 0000000000000000000000000000000000000000..bc89401a9fc007b148cf6c64e2d0191c2ff02ba9 --- /dev/null +++ b/SCMG/config/varables.py @@ -0,0 +1,234 @@ +import re +from rdkit import Chem + +DEFAULT = "default" +AUTO = "auto" + +# Variables +COLUMN_SMILES = "SMILES" +COLUMN_ENCODER = "Encoder" +COLUMN_DECODER = "Decoder" +COLUMN_TASK_TYPE = "TaskType" +COLUMN_ENCODER_SEQUENCE = "EncoderSequence" +COLUMN_DECODER_SEQUENCE = "DecoderSequence" +COLUMN_BOS_TOKEN = "TokenBOS" +COLUMN_CUTS = "Cuts" +COLUMN_MIN_TOP_P = "MinTopP" +COLUMN_MIN_TOKEN_PROB = "MinTokenProb" +COLUMN_TOKEN_EOS_PROB = "TokenEOSProb" +COLUMN_MOLNAME = "MolName" +COLUMN_MOLINDEX = "MolIndex" +COLUMN_MOL_PROB = "MolProb" +COLUMN_MOL_PROB_TOPP = "MolProb_TopP" + +# Task +TOKEN_BEGIN = "" +TOKEN_END = "" +TOKEN_SEP = "" +TOKEN_CODER_SEP = "" +# TRAIN = "Train" +TOKEN_PAD = "" +COLUMN_EXCLUDED_MIN = "ExcludedSize" +COLUMN_SIZE_ToRunForNExt = "ExcludedSize" +COLUMN_SIZE_EXCLUDED = "ExcludedSize" + +# char_level_molecule_generation +COLUMN_task_char_mg = "char_mg" +TOKEN_TASK_CHAR_MG = "" + +# char_level_scaffold_constrained_molecule_generation +COLUMN_task_char_scmg = "char_scmg" +TOKEN_TASK_SCMG_CHAR_RAND = "" +TOKEN_TASK_SCMG_CHAR_CANO = "" +TOKEN_TASK_DG_CHAR_RAND = "" +TOKEN_TASK_DG_CHAR_CANO = "" +LIST_HEAVY_ATOMS = ['c', 'C', 'O', 'N', 'n', 'F', '[C@H]', 'Cl', '[C@@H]', 'S', '[nH]', 's', 'o', 'Br', '[C@]', '[C@@]', 'P', 
'B', '[N+]', '[P@@]', '[P@]', '[S@@]', '[N@+]', '[S@]', '[N@@+]', '[N-]', 'p'] +COLUMN_EXCLUDE_REASON = "Excluded" +COLUMN_STATE = "State" +# chemical_property_prediction +COLUMN_task_chem_pd = "chem_pd" +TOKEN_TASK_CHEM_PD = "" + +# molecule_identification +COLUMN_task_mol_id = "mol_id" +TOKEN_TASK_MOL_ID = "" + + + +FILEPATH_MODEL = "filepath_model" +FILEPATH_INPUT = "filepath_input" +DIRPATH_OUTPUT = "dirpath_output" +RANDOM_AUGUMENT = "random_augument" +TOP_P = "top_p" +TOP_K = "top_k" +MIN_MOL_PROB = "minimum_mol_prob" +MIN_TOKEN_PROB = "minimum_token_prob" +MAX_HEAVY_ATOMS = "maximum_heavy_atoms" +TEMPERATURE = "temperature" + +# Data +VOCAB = "vocab" +SIZE_VOCAB = "size_vocab" +FILENAME_VOCAB = "vocab.pt" +FILENAME_VOCABSTATE = "vocabstate.pt" +FILENAME_DATA_RAW = "data.csv" + +TRAIN = "train" +TEST = "test" +FILENAME_TRAIN_RAW = "train.pt" +FILENAME_TRAIN_EPOCH = lambda x: "train_"+str(x)+".pt" + +FILENAME_TEST = "test.pt" +FILENAME_TEST_RAW = "test.pt" +FILENAME_TEST_EPOCH = lambda x: "test_"+str(x)+".pt" +FILEPATH_VOCAB = "filepath_vocab" +# +# try: +# config.screen_width = os.get_terminal_size()[0] +# except: +# config.screen_width = 141 +MAX_SEQUENCE_LENGTH = "max_sequence_length" +COLUMN_INCHIKEY = "InchiKey" +# Train +MODEL_NAME = "model_name" +MODEL_TYPE = "model_type" +MODEL = "model" +TASKS = "tasks" +DIRPATH_CHECKPOINT = "dirpath_checkpoint" +DIRPATH_DATA = "dirpath_data" +SIZE_BATCH = "size_batch" +SIZE_BLOCK = "size_block" +RATE_LEARNING = "rate_learning" +DEVICE = "device" +EPOCH = "epoch" +EPOCHS = "epochs" +NUM_WORKERS = "num_workers" +DIRPATH_COMPLETED = "dirpath_completed" +DIRPATH_EXCLUDED = "dirpath_excluded" +DIRPATH_SBATCH = "dirpath_sbatch" + +# Stats +TRAIN_LOSS = "train_loss" +TEST_LOSS = "test_loss" +TIME_ELAPSED = "time_elapsed" +RATE_LEARNING = "rate_learning" +TOKENS = "tokens" + +# Model +FILENAME_MODEL_INIT = "model_init.pt" +FILENAME_MODEL_LATEST = "model.pt" +FILENAME_MODEL_TRAINED = lambda x: "model_"+str(x)+".pt" + 
+FILENAME_MODELSTATE_INIT = "modelstate_init.pt" +FILENAME_MODELSTATE_LATEST = "modelstate.pt" +FILENAME_MODELSTATE_TRAINED = lambda x: "modelstate_"+str(x)+".pt" + +FILENAME_SCHEDULER_INIT = "scheduler_init.pt" +FILENAME_SCHEDULER_LATEST = "scheduler.pt" +FILENAME_SCHEDULER_TRAINED = lambda x: "scheduler_"+str(x)+".pt" + +FILENAME_OPTIMIZER_INIT = "optimizer_init.pt" +FILENAME_OPTIMIZER_LATEST = "optimizer.pt" +FILENAME_OPTIMIZER_TRAINED = lambda x: "optimizer_"+str(x)+".pt" + +# FILENAME_TRAINLOG_INIT = "train_init.pt" +FILENAME_TRAINSTATS_LATEST = "trainstats_latest.csv" +FILENAME_TRAINSTATS_TRAINED = lambda x: "trainstats_"+str(x)+".csv" + +FILENAME_TRAINLOG = "train" +FORMAT_TIMESTAMP_FILEHANDLER = "%Y%m%d%H%M%S_%f.log" +FORMAT_TIMESTAMP = "%Y/%m/%d %H:%M:%S %f" + +FORMAT_LOG = "" +DRY_RUN = "dry_run" +LOG_LEVEL = "log_level" +TOKENIZER = "tokenizer" +RUN_ONE_EPOCH = "run_one_epoch" +# # Column names +# IS_NOVEL = "IS_NOVAL" +# NOVALTY = "Novalty" +# # VALIDITY = "Validity" +# IS_VALID = "IS_VALID" +# IS_NOVAL = "IS_NOVAL" +# DIR_SAVE = "dir_save" +# MODEL_LATEST = "model.pt" +# LOG_TRAIN_LATEST = "train_log.csv" +# OPTIMIZER_LATEST = "optimizer.pt" +# SCHEDULER_LATEST = "scheduler.pt" +# TRAIN_LOSS = "train_loss" +# TEST_LOSS = "test_loss" +# TIME_ELAPSED = "time_elapsed" +# # LR = "lr" +# TOKENS = "tokens" + +LOGP = "logP" +WEIGHT = "weight" +QED = "QED" +VALIDITY = "SMILES_VALID" +FILENAME_TRAIN_DIST = "train_dist.pt" +FILENAME_TEST_DIST = "test_dist.pt" +MODEL_PRETRAIN = "model_pretrained.pt" + +PYFILE_SAMPLER = "sampler.py" +PYFILE_TRAINER = "trainer.py" +PYFILE_DATALOADER = "dataloader.py" +# PYFILE_SAMPLER = "sampler.py" + + + + +# Model parameters +NUM_LAYERS = "num_layers" +NUM_ENCODER_LAYERS = "num_encoder_layers" +NUM_DECODER_LAYERS = "num_decoder_layers" +NUM_HEADS = "num_heads" +DIM_ATTENTION = "dim_attention" +DIM_FEEDFORWARD = "dim_feedforward" +DIM_LSTM = "dim_lstm" +DIM_EMBEDDING = "dim_embedding" +DIM_OUTPUT = "dim_output" +RATE_DROPOUT = 
"rate_dropout" + + + + +#Scheduler +SIZE_STEP = "size_step" +GAMMA = "gamma" + + + + + + + + +# From Reinvent-Scaffold-Decorator +ATTACHMENT_POINT_TOKEN = "*" +ATTACHMENT_POINT_NUM_REGEXP = r"\[{}:(\d+)\]".format(re.escape(ATTACHMENT_POINT_TOKEN)) +ATTACHMENT_POINT_REGEXP = r"(?:{0}|\[{0}[^\]]*\])".format(re.escape(ATTACHMENT_POINT_TOKEN)) +ATTACHMENT_POINT_NO_BRACKETS_REGEXP = r"(? 1: + nn.init.xavier_uniform_(p) + # if isinstance(module, (nn.Linear, nn.Embedding)): + # module.weight.data.normal_(mean=0.0, std=0.02) + # if isinstance(module, nn.Linear) and module.bias is not None: + # module.bias.data.zero_() + # elif isinstance(module, nn.LayerNorm): + # module.bias.data.zero_() + # module.weight.data.fill_(1.0) + def init_optimizers(self,train_config): + optimizer = torch.optim.Adam(self.parameters(), lr=train_config[varables.RATE_LEARNING]) + return optimizer + def init_scheduler(self,train_config): + scheduler = torch.optim.lr_scheduler.StepLR(self.optimizer, step_size=train_config[varables.SIZE_STEP], gamma=train_config[varables.GAMMA]) + return scheduler + def get_collate_fn(self, vocab_encoder,vocab_decoder): + def collate(results): + X_Encoder = [a[0] for a in results] + X_Decoder = [a[1] for a in results] + boundary = -1 + max_len_x = max([len(a) for a in X_Encoder]) + max_len_y = max([len(a) for a in X_Decoder]) + x = torch.tensor([(a+[vocab_encoder[varables.TOKEN_PAD]]*(max_len_x-len(a))) for a in X_Encoder],dtype=torch.long) + y = torch.tensor([(a+[vocab_decoder[varables.TOKEN_PAD]]*(max_len_y-len(a))) for a in X_Decoder],dtype=torch.long) + return x,y,boundary + return collate + + def generate_masks(self, X_Decoder): + # Generate encoder, decoder, cross masks + T = X_Decoder.shape[1] + Mask_Decoder = (X_Decoder != self.Token_Padding_Decoder).unsqueeze(-2).unsqueeze(-2).repeat(1,1,T,1) + mask_tril = torch.tril(torch.ones(T, T)).view(1, 1, T, T).to(Mask_Decoder.device) + Mask_Decoder = Mask_Decoder.masked_fill(mask_tril==0,0) + return Mask_Decoder + + 
def forward(self, X_Encoder, X_Decoder, Y_Decoder_Ref=None,boundary=None): + Mask_Decoder = self.generate_masks(X_Decoder) + # preprocess + X_Decoder = self.Dropout1(self.Embedding_Decoder(X_Decoder) * math.sqrt(self.Dim_Attention) + self.pos_emb(X_Decoder.size(1))) + # Decoder blocks + for decoder_block in self.decoder_blocks: + X_Decoder = decoder_block(X_Decoder,Mask_Decoder) + X_Decoder = self.LayerNorm1(X_Decoder) + Y_Decoder_Logits = self.head(X_Decoder) + loss = None + if Y_Decoder_Ref is not None: + loss = F.cross_entropy(Y_Decoder_Logits.view(-1, Y_Decoder_Logits.size(-1)), Y_Decoder_Ref.view(-1),ignore_index=self.Token_Padding_Decoder) + return Y_Decoder_Logits, loss \ No newline at end of file diff --git a/SCMG/models/GPT/sampler.py b/SCMG/models/GPT/sampler.py new file mode 100644 index 0000000000000000000000000000000000000000..1c606302447534d425b50bbd15c153ea79895b65 --- /dev/null +++ b/SCMG/models/GPT/sampler.py @@ -0,0 +1,85 @@ +import random +import numpy as np +import torch +import torch.nn as nn +from torch.nn import functional as F + +def set_seed(seed): + random.seed(seed) + np.random.seed(seed) + torch.manual_seed(seed) + torch.cuda.manual_seed_all(seed) + +def top_k_logits(logits, k): + v, ix = torch.topk(logits, k) + out = logits.clone() + out[out < v[:, [-1]]] = -float('Inf') + return out + +@torch.no_grad() +def sample(model, x, steps, temperature=1.0, sample=False, top_k=None): + block_size = model.get_block_size() + model.eval() + for k in range(steps): + x_cond = x if x.size(1) <= block_size else x[:, -block_size:] + logits, _ = model(x_cond) + logits = logits[:, -1, :] / temperature + if top_k is not None: + logits = top_k_logits(logits, top_k) + probs = F.softmax(logits, dim=-1) + if sample: + ix = torch.multinomial(probs, num_samples=1) + else: + _, ix = torch.topk(probs, k=1, dim=-1) + x = torch.cat((x, ix), dim=1) + + return x + + + + +@torch.no_grad() +def sample(model, x, steps, temperature=1.0,boundary=None): + block_size = 
model.get_block_size() + model.eval() + for k in range(steps): + x_cond = x if x.size(1) <= block_size else x[:, -block_size:] + logits, _ = model(x_cond,boundary=boundary) + logits = logits[:, -1, :] / temperature + probs = F.softmax(logits, dim=-1) + ix = torch.multinomial(probs, num_samples=1) + x = torch.cat((x, ix), dim=1) + return x + +'L_5*C(=O)NCc1cccc(OC)c1.*c1nsc2ccccc12COc1cccc(CNC(=O)c2cccc(NC(=O)c3nsc4ccccc34)c2)c1' + +# for i in range(1,21): +def sample_L(i,option='string'): + # i=2 + prefix = 'L_'+str(i) + string_input = prefix + '*O=C1NN=Cc2c1cccc2.*O=C(C1CC1)N1CCNCC1' + array_input = [vocab[a] for a in [''] + list(string_input)] + boundary = [len(array_input)] + tensor_input = torch.tensor(array_input,device='cuda').unsqueeze(0).repeat(32,1) + boundary = boundary*32 + tensor_output = sample(model,tensor_input,250,boundary=boundary) + strings_output = [] + for j in range(tensor_output.shape[0]): + list_string_output = [inv[a] for a in tensor_output[j,boundary[j]:].cpu().numpy() if a != vocab['']] + # if list_string_output[0] == '': + # list_string_output = list_string_output[1:] + if list_string_output[-1] == '': + list_string_output = list_string_output[:-1] + string_output = ''.join(list_string_output) + strings_output.append(string_output) + print(string_output) + for j in range(tensor_output.shape[0]): + if test_valid(strings_output[j]): + print(1) + else: + print(0) + + # logits,_ = model(tensor_input,boundary=boundary) + + +['', 'L', '_', '5', '*', 'C', '(', '=', 'O', ')', 'N', 'C', 'c', '1', 'c', 'c', 'c', 'c', '(', 'O', 'C', ')', 'c', '1', '.', '*', 'c', '1', 'n', 's', 'c', '2', 'c', 'c', 'c', 'c', 'c', '1', '2', 'C', 'O', 'c', '1', 'c', 'c', 'c', 'c', '(', 'C', 'N', 'C', '(', '=', 'O', ')', 'c', '2', 'c', 'c', 'c', 'c', '(', 'N', 'C', '(', '=', 'O', ')', 'c', '3', 'n', 's', 'c', '4', 'c', 'c', 'c', 'c', 'c', '3', '4', ')', 'c', '2', ')', 'c', '1', ''] diff --git a/SCMG/models/GPT2/__init__.py b/SCMG/models/GPT2/__init__.py new file mode 
100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/SCMG/models/GPT2/__pycache__/__init__.cpython-310.pyc b/SCMG/models/GPT2/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a5e8f67f4fd9d403ef89903b3eaf33ff6f5a62ad Binary files /dev/null and b/SCMG/models/GPT2/__pycache__/__init__.cpython-310.pyc differ diff --git a/SCMG/models/GPT2/__pycache__/model.cpython-310.pyc b/SCMG/models/GPT2/__pycache__/model.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..3b2351be172e79b65b1d42bdd047478a6b150e81 Binary files /dev/null and b/SCMG/models/GPT2/__pycache__/model.cpython-310.pyc differ diff --git a/SCMG/models/GPT2/__pycache__/sampler.cpython-310.pyc b/SCMG/models/GPT2/__pycache__/sampler.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c44a897fb3271ec4e35a8882d260094352c94441 Binary files /dev/null and b/SCMG/models/GPT2/__pycache__/sampler.cpython-310.pyc differ diff --git a/SCMG/models/GPT2/model.py b/SCMG/models/GPT2/model.py new file mode 100644 index 0000000000000000000000000000000000000000..d5b1822fbd0fd2f6e0286723b95567d1630bc5f2 --- /dev/null +++ b/SCMG/models/GPT2/model.py @@ -0,0 +1,197 @@ +import math +import logging + +import torch +import torch.nn as nn +from torch.nn import functional as F + +# logger = logging.getLogger(__name__) +from SCMG.config import varables +from torch.autograd import Variable + +class PositionalEncoder(nn.Module): + def __init__(self, config): + super(PositionalEncoder, self).__init__() + self.Dropout = nn.Dropout(p=config[varables.RATE_DROPOUT]) + max_len = config[varables.SIZE_BLOCK] + pe = torch.zeros(max_len, config[varables.DIM_ATTENTION]) + position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1) + div_term = torch.exp(torch.arange(0, config[varables.DIM_ATTENTION], 2).float() * (-math.log(10000.0) / config[varables.DIM_ATTENTION])) + 
pe[:, 0::2] = torch.sin(position * div_term) + pe[:, 1::2] = torch.cos(position * div_term) + pe = pe.unsqueeze(0) + self.register_buffer('pe', pe) + def forward(self, T): + x = self.Dropout(self.pe[:,:T, :]) + return x + + +class Attention(nn.Module): + def __init__(self, config): + super().__init__() + assert config[varables.DIM_ATTENTION] % config[varables.NUM_HEADS] == 0 + self.Key = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION]) + self.Query = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION]) + self.Value = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION]) + self.Dropout_Attention = nn.Dropout(config[varables.RATE_DROPOUT]) + self.Dropout_Residue = nn.Dropout(config[varables.RATE_DROPOUT]) + self.Projection = nn.Linear(config[varables.DIM_ATTENTION], config[varables.DIM_EMBEDDING]) + self.NumberOfHeads = config[varables.NUM_HEADS] + self.DimHead = config[varables.DIM_ATTENTION] // self.NumberOfHeads + self.DimAttention = config[varables.DIM_ATTENTION] + + def forward(self, X_1,X_2, mask=None): + if X_2 is None: + X_2 = X_1 + BatchSize, T_Encoder, _ = X_1.size() + BatchSize, T_Decoder, _ = X_2.size() + K = self.Key( X_1).view(BatchSize, T_Encoder, self.NumberOfHeads,self.DimHead).transpose(1, 2) + Q = self.Query(X_2).view(BatchSize, T_Decoder, self.NumberOfHeads,self.DimHead).transpose(1, 2) + V = self.Value(X_1).view(BatchSize, T_Encoder, self.NumberOfHeads,self.DimHead).transpose(1, 2) + # k,q,v dimension: (BatchSize, SequenceSize, NumberOfHeads, HeadDimension) 3,4,5,16 + ScoreAttention = (Q @ K.transpose(-2, -1)) / math.sqrt(self.DimHead) + ScoreAttention = ScoreAttention.masked_fill(mask==0, -1e9) + ScoreAttention = F.softmax(ScoreAttention, dim=-1) + ScoreAttention = self.Dropout_Attention(ScoreAttention) + # k.transpose(-2,-1): 3,4,16,5 + # (q@(k.transpose(-2,-1))): 3,4,5,5 + Z = ScoreAttention @ V + # y dimension: 3,4,5,16 + Z = Z.transpose(1, 2).contiguous().view(BatchSize, 
T_Decoder, self.DimAttention) + # y dimension: 3,5,64 + Z = self.Dropout_Residue(self.Projection(Z)) + return Z + + + + + + + + + + +class FeedForward(nn.Module): + def __init__(self, config): + super().__init__() + if config[varables.DIM_FEEDFORWARD] == 0: + Dim_FeedForward = config[varables.DIM_ATTENTION] *4 + else: + Dim_FeedForward = config[varables.DIM_FEEDFORWARD] + self.Linear1 = nn.Linear(config[varables.DIM_EMBEDDING], Dim_FeedForward) + self.GELU = nn.GELU() + self.Linear2 = nn.Linear(Dim_FeedForward, config[varables.DIM_EMBEDDING]) + self.Dropout = nn.Dropout(config[varables.RATE_DROPOUT]) + + def forward(self,x): + x = self.Linear1(x) + x = self.GELU (x) + x = self.Dropout(x) + x = self.Linear2(x) + return x + +class DecoderBlock(nn.Module): + def __init__(self, config): + super().__init__() + self.LayerNorm1 = nn.LayerNorm(config[varables.DIM_EMBEDDING]) + self.LayerNorm2 = nn.LayerNorm(config[varables.DIM_EMBEDDING]) + self.Dropout1 = nn.Dropout(config[varables.RATE_DROPOUT]) + self.Dropout2 = nn.Dropout(config[varables.RATE_DROPOUT]) + self.AttentionMasked = Attention( config) + self.AttentionCross = Attention( config) + self.FeedForward = FeedForward(config) + + def forward(self, X_Decoder,Mask_Decoder): + X_Decoder = self.Dropout1(X_Decoder + self.AttentionMasked(self.LayerNorm1(X_Decoder), None, Mask_Decoder)) + X_Decoder = self.Dropout2(X_Decoder + self.FeedForward (self.LayerNorm2(X_Decoder) )) + return X_Decoder + + + + + + + + + + + + + + + + + + + +class Model(nn.Module): + def __init__(self, config): + super().__init__() + # Varables + self.Dim_Attention = config[varables.DIM_ATTENTION] + self.Token_Padding_Decoder = config["Token_Padding_Decoder"] + # Embedding and positional encoding layers + self.Embedding_Decoder = nn.Embedding(len(config["vocab_decoder"]), config[varables.DIM_ATTENTION]) + self.pos_emb = PositionalEncoder(config) + # Dropout and normalization layers + self.Dropout1 = nn.Dropout(config[varables.RATE_DROPOUT]) + 
self.LayerNorm1 = nn.LayerNorm(config[varables.DIM_EMBEDDING]) + # Transformer layers + self.decoder_blocks = nn.ModuleList([DecoderBlock(config) for _ in range(config[varables.NUM_DECODER_LAYERS])]) + # Output layer + self.head = nn.Linear(config[varables.DIM_ATTENTION], len(config["vocab_decoder"]), bias=False) + # Init + self.apply(self._init_weights) + self.optimizer = None + # logger.info("number of parameters: %e", sum(p.numel() for p in self.parameters())) + + def _init_weights(self, module): + for p in module.parameters(): + if p.dim() > 1: + nn.init.xavier_uniform_(p) + # if isinstance(module, (nn.Linear, nn.Embedding)): + # module.weight.data.normal_(mean=0.0, std=0.02) + # if isinstance(module, nn.Linear) and module.bias is not None: + # module.bias.data.zero_() + # elif isinstance(module, nn.LayerNorm): + # module.bias.data.zero_() + # module.weight.data.fill_(1.0) + def init_optimizers(self,train_config): + optimizer = torch.optim.Adam(self.parameters(), lr=train_config[varables.RATE_LEARNING]) + return optimizer + def init_scheduler(self,train_config): + scheduler = torch.optim.lr_scheduler.StepLR(self.optimizer, step_size=train_config[varables.SIZE_STEP], gamma=train_config[varables.GAMMA]) + return scheduler + def get_collate_fn(self, vocab_encoder,vocab_decoder): + def collate(results): + X_Encoder = [a[0] for a in results] + X_Decoder = [a[1] for a in results] + boundary = -1 + max_len_x = max([len(a) for a in X_Encoder]) + max_len_y = max([len(a) for a in X_Decoder]) + x = torch.tensor([(a+[vocab_encoder[varables.TOKEN_PAD]]*(max_len_x-len(a))) for a in X_Encoder],dtype=torch.long) + y = torch.tensor([(a+[vocab_decoder[varables.TOKEN_PAD]]*(max_len_y-len(a))) for a in X_Decoder],dtype=torch.long) + return x,y,boundary + return collate + + def generate_masks(self, X_Decoder): + # Generate encoder, decoder, cross masks + T = X_Decoder.shape[1] + Mask_Decoder = (X_Decoder != self.Token_Padding_Decoder).unsqueeze(-2).unsqueeze(-2).repeat(1,1,T,1) + 
mask_tril = torch.tril(torch.ones(T, T)).view(1, 1, T, T).to(Mask_Decoder.device) + Mask_Decoder = Mask_Decoder.masked_fill(mask_tril==0,0) + return Mask_Decoder + + def forward(self, X_Encoder, X_Decoder, Y_Decoder_Ref=None,boundary=None): + Mask_Decoder = self.generate_masks(X_Decoder) + # preprocess + X_Decoder = self.Dropout1(self.Embedding_Decoder(X_Decoder) * math.sqrt(self.Dim_Attention) + self.pos_emb(X_Decoder.size(1))) + # Decoder blocks + for decoder_block in self.decoder_blocks: + X_Decoder = decoder_block(X_Decoder,Mask_Decoder) + X_Decoder = self.LayerNorm1(X_Decoder) + Y_Decoder_Logits = self.head(X_Decoder) + loss = None + if Y_Decoder_Ref is not None: + loss = F.cross_entropy(Y_Decoder_Logits.view(-1, Y_Decoder_Logits.size(-1)), Y_Decoder_Ref.view(-1),ignore_index=self.Token_Padding_Decoder) + return Y_Decoder_Logits, loss \ No newline at end of file diff --git a/SCMG/models/GPT2/sampler.py b/SCMG/models/GPT2/sampler.py new file mode 100644 index 0000000000000000000000000000000000000000..1c606302447534d425b50bbd15c153ea79895b65 --- /dev/null +++ b/SCMG/models/GPT2/sampler.py @@ -0,0 +1,85 @@ +import random +import numpy as np +import torch +import torch.nn as nn +from torch.nn import functional as F + +def set_seed(seed): + random.seed(seed) + np.random.seed(seed) + torch.manual_seed(seed) + torch.cuda.manual_seed_all(seed) + +def top_k_logits(logits, k): + v, ix = torch.topk(logits, k) + out = logits.clone() + out[out < v[:, [-1]]] = -float('Inf') + return out + +@torch.no_grad() +def sample(model, x, steps, temperature=1.0, sample=False, top_k=None): + block_size = model.get_block_size() + model.eval() + for k in range(steps): + x_cond = x if x.size(1) <= block_size else x[:, -block_size:] + logits, _ = model(x_cond) + logits = logits[:, -1, :] / temperature + if top_k is not None: + logits = top_k_logits(logits, top_k) + probs = F.softmax(logits, dim=-1) + if sample: + ix = torch.multinomial(probs, num_samples=1) + else: + _, ix = 
torch.topk(probs, k=1, dim=-1) + x = torch.cat((x, ix), dim=1) + + return x + + + + +@torch.no_grad() +def sample(model, x, steps, temperature=1.0,boundary=None): + block_size = model.get_block_size() + model.eval() + for k in range(steps): + x_cond = x if x.size(1) <= block_size else x[:, -block_size:] + logits, _ = model(x_cond,boundary=boundary) + logits = logits[:, -1, :] / temperature + probs = F.softmax(logits, dim=-1) + ix = torch.multinomial(probs, num_samples=1) + x = torch.cat((x, ix), dim=1) + return x + +'L_5*C(=O)NCc1cccc(OC)c1.*c1nsc2ccccc12COc1cccc(CNC(=O)c2cccc(NC(=O)c3nsc4ccccc34)c2)c1' + +# for i in range(1,21): +def sample_L(i,option='string'): + # i=2 + prefix = 'L_'+str(i) + string_input = prefix + '*O=C1NN=Cc2c1cccc2.*O=C(C1CC1)N1CCNCC1' + array_input = [vocab[a] for a in [''] + list(string_input)] + boundary = [len(array_input)] + tensor_input = torch.tensor(array_input,device='cuda').unsqueeze(0).repeat(32,1) + boundary = boundary*32 + tensor_output = sample(model,tensor_input,250,boundary=boundary) + strings_output = [] + for j in range(tensor_output.shape[0]): + list_string_output = [inv[a] for a in tensor_output[j,boundary[j]:].cpu().numpy() if a != vocab['']] + # if list_string_output[0] == '': + # list_string_output = list_string_output[1:] + if list_string_output[-1] == '': + list_string_output = list_string_output[:-1] + string_output = ''.join(list_string_output) + strings_output.append(string_output) + print(string_output) + for j in range(tensor_output.shape[0]): + if test_valid(strings_output[j]): + print(1) + else: + print(0) + + # logits,_ = model(tensor_input,boundary=boundary) + + +['', 'L', '_', '5', '*', 'C', '(', '=', 'O', ')', 'N', 'C', 'c', '1', 'c', 'c', 'c', 'c', '(', 'O', 'C', ')', 'c', '1', '.', '*', 'c', '1', 'n', 's', 'c', '2', 'c', 'c', 'c', 'c', 'c', '1', '2', 'C', 'O', 'c', '1', 'c', 'c', 'c', 'c', '(', 'C', 'N', 'C', '(', '=', 'O', ')', 'c', '2', 'c', 'c', 'c', 'c', '(', 'N', 'C', '(', '=', 'O', ')', 'c', '3', 
'n', 's', 'c', '4', 'c', 'c', 'c', 'c', 'c', '3', '4', ')', 'c', '2', ')', 'c', '1', ''] diff --git a/SCMG/models/LSTM/__init__.py b/SCMG/models/LSTM/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/SCMG/models/LSTM/__pycache__/__init__.cpython-310.pyc b/SCMG/models/LSTM/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a80648fac63e5677b96d14154c9254b403facf09 Binary files /dev/null and b/SCMG/models/LSTM/__pycache__/__init__.cpython-310.pyc differ diff --git a/SCMG/models/LSTM/__pycache__/model.cpython-310.pyc b/SCMG/models/LSTM/__pycache__/model.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8b04314d86110100a1cf6f3b6305bfa3ac129ea4 Binary files /dev/null and b/SCMG/models/LSTM/__pycache__/model.cpython-310.pyc differ diff --git a/SCMG/models/LSTM/__pycache__/sampler.cpython-310.pyc b/SCMG/models/LSTM/__pycache__/sampler.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..3d989a0b8bf05a57af3c6b8aa83b8261fc7996fd Binary files /dev/null and b/SCMG/models/LSTM/__pycache__/sampler.cpython-310.pyc differ diff --git a/SCMG/models/LSTM/__pycache__/trainer.cpython-310.pyc b/SCMG/models/LSTM/__pycache__/trainer.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9ce864c1d5f0af498e375c5f5f29ebd674896f5e Binary files /dev/null and b/SCMG/models/LSTM/__pycache__/trainer.cpython-310.pyc differ diff --git a/SCMG/models/LSTM/model.py b/SCMG/models/LSTM/model.py new file mode 100644 index 0000000000000000000000000000000000000000..59f0e44f03505d9bb05380a3e0014ba13e3b9f22 --- /dev/null +++ b/SCMG/models/LSTM/model.py @@ -0,0 +1,48 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F +import torch.nn.utils.rnn as rnn_utils +from SCMG.config import varables + +class Model(nn.Module): + def __init__(self, config): + 
super().__init__() + self.vocab = config["vocab_encoder"] + # self.vocabulary = vocabulary + # self.hidden_size = config.hidden + # self.num_layers = config.num_layers + # self.dropout = config.dropout + # self.vocab_size = self.input_size = self.output_size = len(vocabulary) + self.embedding_layer = nn.Embedding(len(config["vocab_encoder"]), config[varables.DIM_EMBEDDING]) + self.lstm_layer = nn.LSTM(config[varables.DIM_EMBEDDING], config[varables.DIM_LSTM], + config[varables.NUM_LAYERS], dropout=config[varables.RATE_DROPOUT], + batch_first=True) + self.linear_layer = nn.Linear(config[varables.DIM_LSTM], len(config["vocab_encoder"])) + def get_collate_fn(self, vocab_encoder,vocab_decoder): + def collate(results): + x_in = None + y_in = [a[0] + [vocab_encoder[varables.TOKEN_SEP]] + a[1] for a in results] + # boundary = [a[2] for a in results] + max_len = max([len(a) for a in y_in]) + y = torch.tensor([(a+[vocab_encoder[varables.TOKEN_PAD]]*(max_len-len(a))) for a in y_in],dtype=torch.long) + return x_in,y,0 + return collate + def init_optimizers(self,train_config): + optimizer = torch.optim.Adam(self.parameters(), lr=train_config[varables.RATE_LEARNING]) + return optimizer + def init_scheduler(self,train_config): + scheduler = torch.optim.lr_scheduler.StepLR(self.optimizer, step_size=train_config[varables.SIZE_STEP], gamma=train_config[varables.GAMMA]) + return scheduler + def forward(self, src, trg, trg_out, boundary=None): + # x = ([src , torch.tensor([self.vocab[""]]*x.size[0]).unsqueeze(1).to(x.device), trg],dim=1) + hiddens=None + x = self.embedding_layer(trg) + # x = rnn_utils.pack_padded_sequence(x, lengths, batch_first=True) + self.lstm_layer.flatten_parameters() + x, hiddens = self.lstm_layer(x, hiddens) + # x, _ = rnn_utils.pad_packed_sequence(x, batch_first=True) + logits = self.linear_layer(x) + loss = None + if trg_out is not None: + loss = F.cross_entropy(logits.view(-1, logits.size(-1)), trg_out.view(-1)) + return logits, loss diff --git 
a/SCMG/models/LSTM/sampler.py b/SCMG/models/LSTM/sampler.py new file mode 100644 index 0000000000000000000000000000000000000000..40ab9635c922fd242699930d44a623161f732ac8 --- /dev/null +++ b/SCMG/models/LSTM/sampler.py @@ -0,0 +1,20 @@ +from MoleculeProcessing.utils.utils import * +from MoleculeProcessing.utils.utils_sample import * +import torch.nn.functional as F + +def sample(model,vocab_bos,size_batch=32,size_block=70,temperature=1.,): + model,device = load_to_device(model) + model.eval() + with torch.no_grad(): + tensor_sampled = torch.zeros(size_batch,size_block+1,dtype=torch.long,device=device) + tensor_sampled[:,0] = vocab_bos + hiddens = None + for i in range(size_block): + input_current = tensor_sampled[:,[i]] + probs,hiddens = model.forward(input_current,hiddens) + probs = probs[:,-1] + probs = probs * temperature + probs = F.softmax(probs,dim=-1) + sample = torch.distributions.categorical.Categorical(probs).sample() + tensor_sampled[:,i+1] = sample + return tensor_sampled diff --git a/SCMG/models/LSTM/trainer.py b/SCMG/models/LSTM/trainer.py new file mode 100644 index 0000000000000000000000000000000000000000..969b615c7d42c01d2e13ff3e7ab9e17e7b953daf --- /dev/null +++ b/SCMG/models/LSTM/trainer.py @@ -0,0 +1,195 @@ +import math +import logging +import time +from tqdm import tqdm +import numpy as np + +import torch +import torch.nn as nn +import torch.optim as optim +from torch.optim.lr_scheduler import LambdaLR +from torch.utils.data.dataloader import DataLoader +from MoleculeProcessing.utils.utils_train import * +logger = logging.getLogger(__name__) +from MoleculeProcessing.utils.utils import * +from MoleculeProcessing.utils.utils_train import * +from MoleculeProcessing.config.config import * + +class TrainerConfig: + learning_rate = 3e-4 + betas = (0.9, 0.95) + grad_norm_clip = 1.0 + weight_decay = 0.1 + lr_decay = False + warmup_tokens = 375e6 + final_tokens = 260e9 + ckpt_path = None + num_workers = 0 + config = None + epoch = 0 + + def __init__(self, 
class Trainer:
    """Runs the train/test loop for a model, with checkpointing and resume.

    On construction, if a previous training log exists in the checkpoint
    directory, the model/optimizer/scheduler saved at the last completed
    epoch are reloaded and training resumes from the recorded epoch.
    """

    def __init__(self, model, train_dataset, test_dataset, config):
        self.model = model
        self.train_dataset = train_dataset
        self.test_dataset = test_dataset
        self.config = config
        # continue training if a previous run left a log in the checkpoint dir
        self.train_log = init_train_log()
        path_log = os.path.join(self.config.config.path_checkpoint, LOG_TRAIN_LATEST)
        if os.path.exists(path_log):
            self.train_log = pd.read_csv(path_log)
            self.config.epoch = self.train_log.shape[0]
        if self.train_log.shape[0] > 0:
            # resume: reload every artefact saved at the last completed epoch
            self.model = load_model(self.config.config.path_checkpoint, self.config.epoch - 1)
            self.optimizer = load_optimizer(self.config.config.path_checkpoint, self.config.epoch - 1)
            self.tokens = self.train_log.loc[self.config.epoch - 1, TOKENS]
            self.scheduler = load_scheduler(self.config.config.path_checkpoint, self.config.epoch - 1)
        else:
            self.tokens = 0  # counter used for learning rate decay
            self.optimizer = model.configure_optimizers(config)
            self.scheduler = optim.lr_scheduler.StepLR(self.optimizer, 10, 0.5)
        self.criterion = nn.CrossEntropyLoss()
        # take over whatever gpus are on the system
        self.device = 'cpu'
        if torch.cuda.is_available():
            self.device = torch.cuda.current_device()
            self.model = torch.nn.DataParallel(self.model).to(self.device)

    def save_checkpoint(self):
        """Persist model, optimizer, scheduler, and train log for this epoch.

        Each artefact is written under an epoch-suffixed name (history); the
        train log is additionally written under the "latest" name, which is
        what ``__init__`` looks for when resuming.
        """
        path_checkpoint = self.config.config.path_checkpoint
        # DataParallel wrappers keep the raw model object in .module attribute
        raw_model = self.model.module if hasattr(self.model, "module") else self.model
        logger.info("saving %s", path_checkpoint)
        path_model_epoch = add_before_extension(
            os.path.join(path_checkpoint, MODEL_LATEST), str(self.config.epoch))
        torch.save(raw_model, path_model_epoch)
        # optimizer
        path_optimizer_epoch = add_before_extension(
            os.path.join(path_checkpoint, OPTIMIZER_LATEST), str(self.config.epoch))
        torch.save(self.optimizer, path_optimizer_epoch)
        # scheduler
        path_scheduler_epoch = add_before_extension(
            os.path.join(path_checkpoint, SCHEDULER_LATEST), str(self.config.epoch))
        torch.save(self.scheduler, path_scheduler_epoch)
        # train log: "latest" copy used for resuming, plus an epoch snapshot
        self.train_log.to_csv(
            os.path.join(path_checkpoint, LOG_TRAIN_LATEST), index=False)
        path_train_log_epoch = add_before_extension(
            os.path.join(path_checkpoint, LOG_TRAIN_LATEST), str(self.config.epoch))
        self.train_log.to_csv(path_train_log_epoch, index=False)

    def train(self):
        """Run epochs until config.config.epochs, checkpointing after each."""
        config = self.config
        # NOTE: was `... and self.config.epoch != config.config.epochs`,
        # which is implied by the `<` comparison; simplified.
        while self.config.epoch < config.config.epochs:
            current_status = dict([[a, None] for a in self.train_log.columns])
            current_status[EPOCH] = self.config.epoch
            time_start = time.time()
            current_status = self.run_epoch('train', current_status)
            current_status[TIME_ELAPSED] = int(time.time() - time_start)
            current_status[TOKENS] = self.tokens
            if self.test_dataset is not None:
                current_status = self.run_epoch('test', current_status)
            self.train_log.loc[self.config.epoch] = current_status
            self.scheduler.step()
            self.save_checkpoint()
            self.config.epoch += 1

    def run_epoch(self, split, current_status):
        """Run one pass over the train or test dataset.

        split: 'train' (gradients, lr schedule, progress bar) or 'test'
        (evaluation only). Returns `current_status` updated in place with
        the mean loss (and, for 'train', the last learning rate).
        """
        model = self.model
        is_train = split == 'train'
        model.train(is_train)
        data = self.train_dataset if is_train else self.test_dataset
        data.shuffle(random_state=self.config.epoch)
        loader = DataLoader(data, shuffle=False, pin_memory=True,
                            batch_size=self.config.config.size_batch,
                            num_workers=self.config.num_workers)

        losses = []
        pbar = tqdm(enumerate(loader), total=len(loader)) if is_train else enumerate(loader)
        for it, (x, y) in pbar:

            # place data on the correct device
            x = x.to(self.device)
            y = y.to(self.device)

            # forward the model
            with torch.set_grad_enabled(is_train):
                outputs, _ = model.forward(x)
                loss = self.criterion(outputs.view(-1, outputs.shape[-1]),
                                      y.view(-1))
                loss = loss.mean()  # collapse all losses if they are scattered on multiple gpus
                losses.append(loss.item())

            if is_train:
                # backprop and update the parameters
                model.zero_grad()
                loss.backward()
                torch.nn.utils.clip_grad_norm_(model.parameters(), self.config.grad_norm_clip)
                self.optimizer.step()

                # decay the learning rate based on our progress
                if self.config.lr_decay:
                    self.tokens += (y >= 0).sum()  # number of tokens processed this step (i.e. label is not -100)
                    if self.tokens < self.config.warmup_tokens:
                        # linear warmup
                        lr_mult = float(self.tokens) / float(max(1, self.config.warmup_tokens))
                    else:
                        # cosine learning rate decay
                        progress = float(self.tokens - self.config.warmup_tokens) / float(
                            max(1, self.config.final_tokens - self.config.warmup_tokens))
                        lr_mult = max(0.1, 0.5 * (1.0 + math.cos(math.pi * progress)))
                    lr = self.config.learning_rate * lr_mult
                    # BUG FIX: was `optimizer.param_groups` — `optimizer` is
                    # not defined in this method; the optimizer lives on self.
                    for param_group in self.optimizer.param_groups:
                        param_group['lr'] = lr
                else:
                    lr = self.config.learning_rate
                current_status[LR] = lr

                # report progress (train split only: the test-split iterator is
                # a plain enumerate with no set_description, and `lr` is only
                # defined on the train path)
                pbar.set_description(f"epoch {self.config.epoch+1} iter {it}: train loss {loss.item():.5f}. lr {lr:e}")
        mean_loss = float(np.mean(losses))
        current_status[split + '_loss'] = mean_loss
        if not is_train:
            logger.info("test loss: %f", mean_loss)
        return current_status
import math
import logging

import torch
import torch.nn as nn
from torch.nn import functional as F

logger = logging.getLogger(__name__)
from SCMG.config import varables

# (A commented-out legacy ModelConfig class previously lived here; all
# hyperparameters are now supplied through the `config` mapping keyed by
# SCMG.config.varables constants.)


class CausalSelfAttention(nn.Module):
    """Multi-head self-attention with a causal (lower-triangular) mask.

    Inputs of width DIM_EMBEDDING are projected to DIM_ATTENTION for the
    key/query/value spaces, split across NUM_HEADS heads, attended, and
    projected back to DIM_EMBEDDING. `layer_past` is accepted but unused.
    """

    def __init__(self, config):
        super().__init__()
        # attention width must divide evenly across the heads
        assert config[varables.DIM_ATTENTION] % config[varables.NUM_HEADS] == 0
        self.key = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION])
        self.query = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION])
        self.value = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION])
        self.dropout_attention = nn.Dropout(config[varables.RATE_DROPOUT])
        self.dropout_residue = nn.Dropout(config[varables.RATE_DROPOUT])
        self.projection = nn.Linear(config[varables.DIM_ATTENTION], config[varables.DIM_EMBEDDING])
        # fixed lower-triangular mask buffer, shape (1, 1, SIZE_BLOCK, SIZE_BLOCK)
        self.register_buffer("mask", torch.tril(torch.ones(config[varables.SIZE_BLOCK], config[varables.SIZE_BLOCK]))
                             .view(1, 1, config[varables.SIZE_BLOCK], config[varables.SIZE_BLOCK]))
        self.n_head = config[varables.NUM_HEADS]
        self.single_head_dim = config[varables.DIM_ATTENTION] // self.n_head
        self.attention_features = config[varables.DIM_ATTENTION]

    def forward(self, x, layer_past=None):
        # x: (B, T, DIM_EMBEDDING); requires T <= SIZE_BLOCK (mask size)
        B, T, C = x.size()
        k = self.key(x).view(B, T, self.n_head, self.single_head_dim).transpose(1, 2)
        q = self.query(x).view(B, T, self.n_head, self.single_head_dim).transpose(1, 2)
        v = self.value(x).view(B, T, self.n_head, self.single_head_dim).transpose(1, 2)
        # scaled dot-product attention scores: (B, n_head, T, T)
        att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1)))
        # positions above the diagonal are blocked so a token only attends backwards
        att = att.masked_fill(self.mask[:, :, :T, :T] == 0, float('-inf'))
        att = F.softmax(att, dim=-1)
        att = self.dropout_attention(att)
        y = att @ v
        # re-merge the heads: (B, n_head, T, head_dim) -> (B, T, DIM_ATTENTION)
        y = y.transpose(1, 2).contiguous().view(B, T, self.attention_features)
        y = self.dropout_residue(self.projection(y))
        return y


class CrossAttention(nn.Module):
    """Multi-head attention where queries come from the decoder stream and
    keys/values from the encoder stream.

    NOTE(review): the lower-triangular mask is applied to the cross-attention
    scores (decoder position i may only attend to encoder positions <= i);
    cross-attention is normally unmasked — confirm this is intended.
    """

    def __init__(self, config):
        super().__init__()
        assert config[varables.DIM_ATTENTION] % config[varables.NUM_HEADS] == 0
        self.key = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION])
        self.query = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION])
        self.value = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION])
        self.dropout_attention = nn.Dropout(config[varables.RATE_DROPOUT])
        self.dropout_residue = nn.Dropout(config[varables.RATE_DROPOUT])
        self.projection = nn.Linear(config[varables.DIM_ATTENTION], config[varables.DIM_EMBEDDING])
        self.n_head = config[varables.NUM_HEADS]
        self.single_head_dim = config[varables.DIM_ATTENTION] // self.n_head
        self.attention_features = config[varables.DIM_ATTENTION]
        self.register_buffer("mask", torch.tril(torch.ones(config[varables.SIZE_BLOCK], config[varables.SIZE_BLOCK]))
                             .view(1, 1, config[varables.SIZE_BLOCK], config[varables.SIZE_BLOCK]))

    def forward(self, x_encoder, x_decoder, layer_past=None):
        B_encoder, T_encoder, C_encoder = x_encoder.size()
        B_decoder, T_decoder, C_decoder = x_decoder.size()
        # NOTE(review): decoder projections are viewed with B_encoder —
        # assumes encoder and decoder share the same batch size.
        k = self.key(x_encoder).view(B_encoder, T_encoder, self.n_head, self.single_head_dim).transpose(1, 2)
        q = self.query(x_decoder).view(B_encoder, T_decoder, self.n_head, self.single_head_dim).transpose(1, 2)
        v = self.value(x_encoder).view(B_encoder, T_encoder, self.n_head, self.single_head_dim).transpose(1, 2)
        # scores: (B, n_head, T_decoder, T_encoder)
        att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1)))
        att = att.masked_fill(self.mask[:, :, :T_decoder, :T_encoder] == 0, float('-inf'))
        att = F.softmax(att, dim=-1)
        att = self.dropout_attention(att)
        y = att @ v
        y = y.transpose(1, 2).contiguous().view(B_encoder, T_decoder, self.attention_features)
        y = self.dropout_residue(self.projection(y))
        return y


class EncoderBlock(nn.Module):
    """Pre-norm transformer encoder block: self-attention + MLP, with
    residual connections around each sub-layer."""

    def __init__(self, config):
        super().__init__()
        self.ln1 = nn.LayerNorm(config[varables.DIM_EMBEDDING])
        self.ln2 = nn.LayerNorm(config[varables.DIM_EMBEDDING])
        self.attn = CausalSelfAttention(config)
        self.mlp = nn.Sequential(
            nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_FEEDFORWARD]),
            nn.GELU(),
            nn.Linear(config[varables.DIM_FEEDFORWARD], config[varables.DIM_EMBEDDING]),
            nn.Dropout(config[varables.RATE_DROPOUT]),
        )

    def forward(self, x):
        x = x + self.attn(self.ln1(x))
        x = x + self.mlp(self.ln2(x))
        return x


class DecoderBlock(nn.Module):
    """Pre-norm transformer decoder block: masked self-attention, then
    cross-attention onto the encoder output, then MLP."""

    def __init__(self, config):
        super().__init__()
        self.ln1 = nn.LayerNorm(config[varables.DIM_EMBEDDING])
        self.ln2 = nn.LayerNorm(config[varables.DIM_EMBEDDING])
        self.masked_attn = CausalSelfAttention(config)
        self.cross_attn = CrossAttention(config)
        self.mlp = nn.Sequential(
            nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_FEEDFORWARD]),
            nn.GELU(),
            nn.Linear(config[varables.DIM_FEEDFORWARD], config[varables.DIM_EMBEDDING]),
            nn.Dropout(config[varables.RATE_DROPOUT]),
        )

    def forward(self, x_encoder, x):
        x = x + self.masked_attn(self.ln1(x))
        # NOTE(review): ln1 is reused for the cross-attention sub-layer;
        # a dedicated LayerNorm per sub-layer is more common — confirm intended.
        x = x + self.cross_attn(x_encoder, self.ln1(x))
        x = x + self.mlp(self.ln2(x))
        return x
class Norm(nn.Module):
    """Layer normalisation with learnable gain (`alpha`) and shift (`bias`).

    Note: uses the sample standard deviation (``Tensor.std``, unbiased),
    which differs slightly from ``nn.LayerNorm``'s population variance.
    """

    def __init__(self, d_model, eps = 1e-6):
        super().__init__()

        self.size = d_model

        # learnable per-feature scale and offset
        self.alpha = nn.Parameter(torch.ones(self.size))
        self.bias = nn.Parameter(torch.zeros(self.size))

        self.eps = eps

    def forward(self, x):
        centered = x - x.mean(dim=-1, keepdim=True)
        spread = x.std(dim=-1, keepdim=True) + self.eps
        return self.alpha * centered / spread + self.bias


def attention(q, k, v, d_k, mask=None, dropout=None):
    """Scaled dot-product attention; q/k/v have shape (bs, heads, sl, d_k)."""
    weights = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(d_k)

    if mask is not None:
        # broadcast the mask across the head dimension; 0 entries are blocked
        weights = weights.masked_fill(mask.unsqueeze(1) == 0, -1e9)

    weights = F.softmax(weights, dim=-1)

    if dropout is not None:
        weights = dropout(weights)

    return torch.matmul(weights, v)


class MultiHeadAttention(nn.Module):
    """Standard multi-head attention: project, split into heads, attend,
    re-merge, and project out."""

    def __init__(self, heads, d_model, dropout = 0.1):
        super().__init__()

        self.d_model = d_model
        self.d_k = d_model // heads
        self.h = heads

        self.q_linear = nn.Linear(d_model, d_model)
        self.v_linear = nn.Linear(d_model, d_model)
        self.k_linear = nn.Linear(d_model, d_model)

        self.dropout = nn.Dropout(dropout)
        self.out = nn.Linear(d_model, d_model)

    def forward(self, q, k, v, mask=None):
        bs = q.size(0)

        # project, split into heads, then bring the head axis forward:
        # (bs, sl, d_model) -> (bs, h, sl, d_k)
        k = self.k_linear(k).view(bs, -1, self.h, self.d_k).transpose(1, 2)
        q = self.q_linear(q).view(bs, -1, self.h, self.d_k).transpose(1, 2)
        v = self.v_linear(v).view(bs, -1, self.h, self.d_k).transpose(1, 2)

        attended = attention(q, k, v, self.d_k, mask, self.dropout)

        # merge the heads back: (bs, h, sl, d_k) -> (bs, sl, d_model)
        merged = attended.transpose(1, 2).contiguous().view(bs, -1, self.d_model)
        return self.out(merged)


class FeedForward(nn.Module):
    """Position-wise feed-forward block: linear -> ReLU -> dropout -> linear."""

    def __init__(self, d_model, d_ff=2048, dropout = 0.1):
        super().__init__()

        # d_ff defaults to 2048 as in the original transformer
        self.linear_1 = nn.Linear(d_model, d_ff)
        self.dropout = nn.Dropout(dropout)
        self.linear_2 = nn.Linear(d_ff, d_model)

    def forward(self, x):
        hidden = self.dropout(F.relu(self.linear_1(x)))
        return self.linear_2(hidden)


import torch
import torch.nn as nn
import copy


class EncoderLayer(nn.Module):
    """Pre-norm encoder layer: self-attention then feed-forward, with a
    residual connection and dropout around each sub-layer."""

    def __init__(self, d_model, heads, dropout=0.1):
        super().__init__()
        self.norm_1 = Norm(d_model)
        self.norm_2 = Norm(d_model)
        self.attn = MultiHeadAttention(heads, d_model, dropout=dropout)
        self.ff = FeedForward(d_model, dropout=dropout)
        self.dropout_1 = nn.Dropout(dropout)
        self.dropout_2 = nn.Dropout(dropout)

    def forward(self, x, mask):
        normed = self.norm_1(x)
        x = x + self.dropout_1(self.attn(normed, normed, normed, mask))
        normed = self.norm_2(x)
        return x + self.dropout_2(self.ff(normed))


class DecoderLayer(nn.Module):
    """Pre-norm decoder layer: masked self-attention, cross-attention over
    the encoder outputs, then feed-forward; residual + dropout around each."""

    def __init__(self, d_model, heads, dropout=0.1):
        super().__init__()
        self.norm_1 = Norm(d_model)
        self.norm_2 = Norm(d_model)
        self.norm_3 = Norm(d_model)

        self.dropout_1 = nn.Dropout(dropout)
        self.dropout_2 = nn.Dropout(dropout)
        self.dropout_3 = nn.Dropout(dropout)

        self.attn_1 = MultiHeadAttention(heads, d_model, dropout=dropout)
        self.attn_2 = MultiHeadAttention(heads, d_model, dropout=dropout)
        self.ff = FeedForward(d_model, dropout=dropout)

    def forward(self, x, e_outputs, src_mask, trg_mask):
        normed = self.norm_1(x)
        x = x + self.dropout_1(self.attn_1(normed, normed, normed, trg_mask))
        normed = self.norm_2(x)
        x = x + self.dropout_2(self.attn_2(normed, e_outputs, e_outputs, src_mask))
        normed = self.norm_3(x)
        return x + self.dropout_3(self.ff(normed))


import torch
import torch.nn as nn
import math
from torch.autograd import Variable


class Embedder(nn.Module):
    """Thin wrapper around nn.Embedding mapping token ids to d_model vectors."""

    def __init__(self, vocab_size, d_model):
        super().__init__()
        self.d_model = d_model
        self.embed = nn.Embedding(vocab_size, d_model)

    def forward(self, x):
        return self.embed(x)
class PositionalEncoder(nn.Module):
    """Adds fixed sinusoidal positional encodings to an embedding sequence.

    The table is precomputed for `max_seq_len` positions and registered as a
    (1, max_seq_len, d_model) buffer.
    """

    def __init__(self, d_model, max_seq_len = 200, dropout = 0.1):
        super().__init__()
        self.d_model = d_model
        self.dropout = nn.Dropout(dropout)
        # create constant 'pe' matrix with values dependent on pos and i
        pe = torch.zeros(max_seq_len, d_model)
        for pos in range(max_seq_len):
            for i in range(0, d_model, 2):
                pe[pos, i] = \
                    math.sin(pos / (10000 ** ((2 * i) / d_model)))
                # NOTE(review): the canonical transformer encoding uses the
                # same frequency for each sin/cos pair (exponent 2*i/d_model);
                # this keeps the original 2*(i+1) variant — confirm before
                # changing, since trained checkpoints depend on it.
                pe[pos, i + 1] = \
                    math.cos(pos / (10000 ** ((2 * (i + 1)) / d_model)))
        pe = pe.unsqueeze(0)
        self.register_buffer('pe', pe)

    def forward(self, x):
        # scale embeddings so they are not swamped by the positional signal
        x = x * math.sqrt(self.d_model)
        seq_len = x.size(1)
        pe = Variable(self.pe[:, :seq_len], requires_grad=False)
        if x.is_cuda:
            # BUG FIX: Tensor.cuda() returns a copy, it is not in-place; the
            # original called `pe.cuda()` without rebinding, so a CPU `pe`
            # was added to a CUDA `x` (device mismatch).
            pe = pe.cuda()
        x = x + pe
        return self.dropout(x)


def get_clones(module, N):
    """Return a ModuleList of N independent deep copies of `module`."""
    return nn.ModuleList([copy.deepcopy(module) for i in range(N)])


class Encoder(nn.Module):
    """Embedding + positional encoding, N EncoderLayers, final Norm."""

    def __init__(self, vocab_size, d_model, N, heads, dropout):
        super().__init__()
        self.N = N
        self.embed = Embedder(vocab_size, d_model)
        self.pe = PositionalEncoder(d_model, dropout=dropout)
        self.layers = get_clones(EncoderLayer(d_model, heads, dropout), N)
        self.norm = Norm(d_model)

    def forward(self, src, mask):
        x = self.embed(src)
        x = self.pe(x)
        for i in range(self.N):
            x = self.layers[i](x, mask)
        return self.norm(x)


class Decoder(nn.Module):
    """Embedding + positional encoding, N DecoderLayers, final Norm."""

    def __init__(self, vocab_size, d_model, N, heads, dropout):
        super().__init__()
        self.N = N
        self.embed = Embedder(vocab_size, d_model)
        self.pe = PositionalEncoder(d_model, dropout=dropout)
        self.layers = get_clones(DecoderLayer(d_model, heads, dropout), N)
        self.norm = Norm(d_model)

    def forward(self, trg, e_outputs, src_mask, trg_mask):
        x = self.embed(trg)
        x = self.pe(x)
        for i in range(self.N):
            x = self.layers[i](x, e_outputs, src_mask, trg_mask)
        return self.norm(x)


class Model(nn.Module):
    """Seq2seq transformer with separate encoder and decoder vocabularies.

    Expects `config` to carry "vocab_encoder"/"vocab_decoder" mappings plus
    hyperparameters keyed by SCMG.config.varables constants.
    """

    def __init__(self, config):
        super().__init__()
        self.encoder = Encoder(len(config["vocab_encoder"]), config[varables.DIM_ATTENTION], config[varables.NUM_LAYERS], config[varables.NUM_HEADS], config[varables.RATE_DROPOUT])
        self.decoder = Decoder(len(config["vocab_decoder"]), config[varables.DIM_ATTENTION], config[varables.NUM_LAYERS], config[varables.NUM_HEADS], config[varables.RATE_DROPOUT])
        self.out = nn.Linear(config[varables.DIM_ATTENTION], len(config["vocab_decoder"]))
        self.optimizer = None

    def get_block_size(self):
        # NOTE(review): self.block_size is never assigned in this class (the
        # assignment is commented out in the original), so calling this
        # raises AttributeError — likely a leftover from a GPT-style model.
        return self.block_size

    def _init_weights(self, module):
        """minGPT-style init: N(0, 0.02) for linears/embeddings, unit LayerNorm."""
        if isinstance(module, (nn.Linear, nn.Embedding)):
            module.weight.data.normal_(mean=0.0, std=0.02)
            if isinstance(module, nn.Linear) and module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

    def init_optimizers(self, train_config):
        """Create an Adam optimizer over all parameters."""
        optimizer = torch.optim.Adam(self.parameters(), lr=train_config[varables.RATE_LEARNING])
        return optimizer

    def init_scheduler(self, train_config):
        """Create a StepLR schedule for self.optimizer (set it first)."""
        scheduler = torch.optim.lr_scheduler.StepLR(self.optimizer, step_size=train_config[varables.SIZE_STEP], gamma=train_config[varables.GAMMA])
        return scheduler

    def get_collate_fn(self, vocab_encoder, vocab_decoder):
        """Return a collate fn right-padding encoder/decoder id lists to the batch max."""
        def collate(results):
            x_in = [a[0] for a in results]
            y_in = [a[1] for a in results]
            boundary = -1
            max_len_x = max([len(a) for a in x_in])
            max_len_y = max([len(a) for a in y_in])
            x = torch.tensor([(a + [vocab_encoder[varables.TOKEN_PAD]] * (max_len_x - len(a))) for a in x_in], dtype=torch.long)
            y = torch.tensor([(a + [vocab_decoder[varables.TOKEN_PAD]] * (max_len_y - len(a))) for a in y_in], dtype=torch.long)
            return x, y, boundary
        return collate

    def forward(self, src, trg, trg_out, boundary=None):
        src_mask = None
        # Causal (no-peek) mask over target positions, shape (1, T, T);
        # attention() unsqueezes dim 1 to give (1, 1, T, T).
        # BUG FIX: was .view(1, 1, T, T), which became 5-D after the
        # unsqueeze inside attention() and broke masked_fill broadcasting.
        trg_mask = torch.tril(torch.ones(trg.shape[1], trg.shape[1])).view(1, trg.shape[1], trg.shape[1]).to(trg.device)
        e_outputs = self.encoder(src, src_mask)
        d_output = self.decoder(trg, e_outputs, src_mask, trg_mask)
        logits = self.out(d_output)
        loss = None
        if trg_out is not None:
            loss = F.cross_entropy(logits.view(-1, logits.size(-1)), trg_out.view(-1))
        return logits, loss
class CausalSelfAttention(nn.Module):
    """Multi-head self-attention with a causal (lower-triangular) mask.

    Inputs of width DIM_EMBEDDING are projected to DIM_ATTENTION, split
    across NUM_HEADS heads, attended, and projected back to DIM_EMBEDDING.
    `layer_past` is accepted but unused.
    """

    def __init__(self, config):
        super().__init__()
        # attention width must divide evenly across the heads
        assert config[varables.DIM_ATTENTION] % config[varables.NUM_HEADS] == 0
        self.key = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION])
        self.query = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION])
        self.value = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION])
        self.dropout_attention = nn.Dropout(config[varables.RATE_DROPOUT])
        self.dropout_residue = nn.Dropout(config[varables.RATE_DROPOUT])
        self.projection = nn.Linear(config[varables.DIM_ATTENTION], config[varables.DIM_EMBEDDING])
        # fixed lower-triangular mask buffer, shape (1, 1, SIZE_BLOCK, SIZE_BLOCK)
        self.register_buffer("mask", torch.tril(torch.ones(config[varables.SIZE_BLOCK], config[varables.SIZE_BLOCK]))
                             .view(1, 1, config[varables.SIZE_BLOCK], config[varables.SIZE_BLOCK]))
        self.n_head = config[varables.NUM_HEADS]
        self.single_head_dim = config[varables.DIM_ATTENTION] // self.n_head
        self.attention_features = config[varables.DIM_ATTENTION]

    def forward(self, x, layer_past=None):
        # x: (B, T, DIM_EMBEDDING); requires T <= SIZE_BLOCK
        B, T, C = x.size()
        k = self.key(x).view(B, T, self.n_head, self.single_head_dim).transpose(1, 2)
        q = self.query(x).view(B, T, self.n_head, self.single_head_dim).transpose(1, 2)
        v = self.value(x).view(B, T, self.n_head, self.single_head_dim).transpose(1, 2)
        # scaled dot-product scores, masked so tokens only attend backwards
        att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1)))
        att = att.masked_fill(self.mask[:, :, :T, :T] == 0, float('-inf'))
        att = F.softmax(att, dim=-1)
        att = self.dropout_attention(att)
        y = att @ v
        y = y.transpose(1, 2).contiguous().view(B, T, self.attention_features)
        y = self.dropout_residue(self.projection(y))
        return y


class CrossAttention(nn.Module):
    """Multi-head attention: queries from the decoder stream, keys/values
    from the encoder stream.

    NOTE(review): the lower-triangular mask is applied to cross-attention
    scores; cross-attention is normally unmasked — confirm intended.
    """

    def __init__(self, config):
        super().__init__()
        assert config[varables.DIM_ATTENTION] % config[varables.NUM_HEADS] == 0
        self.key = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION])
        self.query = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION])
        self.value = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION])
        self.dropout_attention = nn.Dropout(config[varables.RATE_DROPOUT])
        self.dropout_residue = nn.Dropout(config[varables.RATE_DROPOUT])
        self.projection = nn.Linear(config[varables.DIM_ATTENTION], config[varables.DIM_EMBEDDING])
        self.n_head = config[varables.NUM_HEADS]
        self.single_head_dim = config[varables.DIM_ATTENTION] // self.n_head
        self.attention_features = config[varables.DIM_ATTENTION]
        self.register_buffer("mask", torch.tril(torch.ones(config[varables.SIZE_BLOCK], config[varables.SIZE_BLOCK]))
                             .view(1, 1, config[varables.SIZE_BLOCK], config[varables.SIZE_BLOCK]))

    def forward(self, x_encoder, x_decoder, layer_past=None):
        B_encoder, T_encoder, C_encoder = x_encoder.size()
        B_decoder, T_decoder, C_decoder = x_decoder.size()
        # NOTE(review): decoder tensors are viewed with B_encoder — assumes
        # encoder and decoder batches have the same size.
        k = self.key(x_encoder).view(B_encoder, T_encoder, self.n_head, self.single_head_dim).transpose(1, 2)
        q = self.query(x_decoder).view(B_encoder, T_decoder, self.n_head, self.single_head_dim).transpose(1, 2)
        v = self.value(x_encoder).view(B_encoder, T_encoder, self.n_head, self.single_head_dim).transpose(1, 2)
        att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1)))
        att = att.masked_fill(self.mask[:, :, :T_decoder, :T_encoder] == 0, float('-inf'))
        att = F.softmax(att, dim=-1)
        att = self.dropout_attention(att)
        y = att @ v
        y = y.transpose(1, 2).contiguous().view(B_encoder, T_decoder, self.attention_features)
        y = self.dropout_residue(self.projection(y))
        return y


class EncoderBlock(nn.Module):
    """Pre-norm transformer encoder block: self-attention + MLP, residual
    connections around each sub-layer."""

    def __init__(self, config):
        super().__init__()
        self.ln1 = nn.LayerNorm(config[varables.DIM_EMBEDDING])
        self.ln2 = nn.LayerNorm(config[varables.DIM_EMBEDDING])
        self.attn = CausalSelfAttention(config)
        self.mlp = nn.Sequential(
            nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_FEEDFORWARD]),
            nn.GELU(),
            nn.Linear(config[varables.DIM_FEEDFORWARD], config[varables.DIM_EMBEDDING]),
            nn.Dropout(config[varables.RATE_DROPOUT]),
        )

    def forward(self, x):
        x = x + self.attn(self.ln1(x))
        x = x + self.mlp(self.ln2(x))
        return x


class DecoderBlock(nn.Module):
    """Pre-norm transformer decoder block: masked self-attention, then
    cross-attention onto the encoder output, then MLP."""

    def __init__(self, config):
        super().__init__()
        self.ln1 = nn.LayerNorm(config[varables.DIM_EMBEDDING])
        self.ln2 = nn.LayerNorm(config[varables.DIM_EMBEDDING])
        self.masked_attn = CausalSelfAttention(config)
        self.cross_attn = CrossAttention(config)
        self.mlp = nn.Sequential(
            nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_FEEDFORWARD]),
            nn.GELU(),
            nn.Linear(config[varables.DIM_FEEDFORWARD], config[varables.DIM_EMBEDDING]),
            nn.Dropout(config[varables.RATE_DROPOUT]),
        )

    def forward(self, x_encoder, x):
        x = x + self.masked_attn(self.ln1(x))
        # NOTE(review): ln1 reused for the cross-attention sub-layer — a
        # dedicated LayerNorm per sub-layer is more common; confirm intended.
        x = x + self.cross_attn(x_encoder, self.ln1(x))
        x = x + self.mlp(self.ln2(x))
        return x


class Model(nn.Module):
    """GPT-style encoder-decoder over a single shared vocabulary and a
    shared token/position embedding for both streams."""

    def __init__(self, config):
        super().__init__()
        # shared token embedding and learned positional embedding
        self.tok_emb = nn.Embedding(config[varables.SIZE_VOCAB], config[varables.DIM_EMBEDDING])
        self.pos_emb = nn.Parameter(torch.zeros(1, config[varables.SIZE_BLOCK], config[varables.DIM_EMBEDDING]))
        self.drop = nn.Dropout(config[varables.RATE_DROPOUT])
        self.encoder_blocks = nn.ModuleList([EncoderBlock(config) for _ in range(config[varables.NUM_LAYERS])])
        self.decoder_blocks = nn.ModuleList([DecoderBlock(config) for _ in range(config[varables.NUM_LAYERS])])
        # self.blocks = nn.Sequential(*[DecoderBlock(config) for _ in range(config[varables.NUM_LAYERS])])
        self.ln_f = nn.LayerNorm(config[varables.DIM_EMBEDDING])
        self.head = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.SIZE_VOCAB], bias=False)
        self.block_size = config[varables.SIZE_BLOCK]
        self.apply(self._init_weights)
        logger.info("number of parameters: %e", sum(p.numel() for p in self.parameters()))
        self.optimizer = None

    def get_block_size(self):
        """Maximum sequence length supported by the positional embedding."""
        return self.block_size

    def _init_weights(self, module):
        """minGPT-style init: N(0, 0.02) for linears/embeddings, unit LayerNorm."""
        if isinstance(module, (nn.Linear, nn.Embedding)):
            module.weight.data.normal_(mean=0.0, std=0.02)
            if isinstance(module, nn.Linear) and module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

    def init_optimizers(self, train_config):
        # Adam over all parameters at the configured learning rate
        optimizer = torch.optim.Adam(self.parameters(), lr=train_config[varables.RATE_LEARNING])
        return optimizer

    def init_scheduler(self, train_config):
        # StepLR over self.optimizer (must already be assigned)
        scheduler = torch.optim.lr_scheduler.StepLR(self.optimizer, step_size=train_config[varables.SIZE_STEP], gamma=train_config[varables.GAMMA])
        return scheduler

    def get_collate_fn(self, vocab):
        """Return a collate fn right-padding id lists to the batch max length."""
        def collate(results):
            x_in = [a[0] for a in results]
            y_in = [a[1] for a in results]
            boundary = -1
            max_len_x = max([len(a) for a in x_in])
            max_len_y = max([len(a) for a in y_in])
            x = torch.tensor([(a + [vocab[varables.TOKEN_PAD]] * (max_len_x - len(a))) for a in x_in], dtype=torch.long)
            y = torch.tensor([(a + [vocab[varables.TOKEN_PAD]] * (max_len_y - len(a))) for a in y_in], dtype=torch.long)
            return x, y, boundary
        return collate

    def forward(self, x_in, y_in, y_out=None, boundary=None):
        # embed tokens + positions for both encoder and decoder inputs
        x_in = self.drop(self.tok_emb(x_in) + self.pos_emb[:, :x_in.size()[1], :])
        y_in = self.drop(self.tok_emb(y_in) + self.pos_emb[:, :y_in.size()[1], :])
        #
        for encoder_block in self.encoder_blocks:
            x_in = encoder_block(x_in)
        # NOTE(review): ln_f is shared between the encoder and decoder
        # streams — confirm a single final LayerNorm is intended for both.
        x_in = self.ln_f(x_in)
        for decoder_block in self.decoder_blocks:
            y_in = decoder_block(x_in, y_in)
        y_in = self.ln_f(y_in)
        logits = self.head(y_in)
        loss = None
        if y_out is not None:
            loss = F.cross_entropy(logits.view(-1, logits.size(-1)), y_out.view(-1))
        return logits, loss

# mark test
import Variable + +# class PositionalEncoder(nn.Module): +# def __init__(self, config): +# super().__init__() +# pe = torch.zeros(config[varables.SIZE_BLOCK], config[varables.DIM_ATTENTION]) +# for pos in range(config[varables.SIZE_BLOCK]): +# for i in range(0, config[varables.DIM_ATTENTION], 2): +# pe[pos, i] = \ +# math.sin(pos / (10000 ** ((2 * i)/config[varables.DIM_ATTENTION]))) +# pe[pos, i + 1] = \ +# math.cos(pos / (10000 ** ((2 * (i + 1))/config[varables.DIM_ATTENTION]))) +# pe = pe.unsqueeze(0) +# self.register_buffer('pe', pe) +# def forward(self, T): +# #add constant to embedding +# x = Variable(self.pe[:,:T], requires_grad=False) +# return x + + + +class PositionalEncoder(nn.Module): + def __init__(self, config): + super(PositionalEncoder, self).__init__() + self.Dropout = nn.Dropout(p=config[varables.RATE_DROPOUT]) + max_len = config[varables.SIZE_BLOCK] + pe = torch.zeros(max_len, config[varables.DIM_ATTENTION]) + position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1) + div_term = torch.exp(torch.arange(0, config[varables.DIM_ATTENTION], 2).float() * (-math.log(10000.0) / config[varables.DIM_ATTENTION])) + pe[:, 0::2] = torch.sin(position * div_term) + pe[:, 1::2] = torch.cos(position * div_term) + pe = pe.unsqueeze(0) + self.register_buffer('pe', pe) + def forward(self, T): + x = self.Dropout(self.pe[:,:T, :]) + return x + + + +class Attention(nn.Module): + def __init__(self, config): + super().__init__() + assert config[varables.DIM_ATTENTION] % config[varables.NUM_HEADS] == 0 + self.Key = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION]) + self.Query = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION]) + self.Value = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION]) + self.Dropout_Attention = nn.Dropout(config[varables.RATE_DROPOUT]) + self.Dropout_Residue = nn.Dropout(config[varables.RATE_DROPOUT]) + self.Projection = nn.Linear(config[varables.DIM_ATTENTION], 
config[varables.DIM_EMBEDDING]) + self.NumberOfHeads = config[varables.NUM_HEADS] + self.DimHead = config[varables.DIM_ATTENTION] // self.NumberOfHeads + self.DimAttention = config[varables.DIM_ATTENTION] + + def forward(self, X_1,X_2, mask=None): + if X_2 is None: + X_2 = X_1 + BatchSize, T_Encoder, _ = X_1.size() + BatchSize, T_Decoder, _ = X_2.size() + K = self.Key( X_1).view(BatchSize, T_Encoder, self.NumberOfHeads,self.DimHead).transpose(1, 2) + Q = self.Query(X_2).view(BatchSize, T_Decoder, self.NumberOfHeads,self.DimHead).transpose(1, 2) + V = self.Value(X_1).view(BatchSize, T_Encoder, self.NumberOfHeads,self.DimHead).transpose(1, 2) + # k,q,v dimension: (BatchSize, SequenceSize, NumberOfHeads, HeadDimension) 3,4,5,16 + ScoreAttention = (Q @ K.transpose(-2, -1)) / math.sqrt(self.DimHead) + ScoreAttention = ScoreAttention.masked_fill(mask==0, -1e9) + ScoreAttention = F.softmax(ScoreAttention, dim=-1) + ScoreAttention = self.Dropout_Attention(ScoreAttention) + # k.transpose(-2,-1): 3,4,16,5 + # (q@(k.transpose(-2,-1))): 3,4,5,5 + Z = ScoreAttention @ V + # y dimension: 3,4,5,16 + Z = Z.transpose(1, 2).contiguous().view(BatchSize, T_Decoder, self.DimAttention) + # y dimension: 3,5,64 + Z = self.Dropout_Residue(self.Projection(Z)) + return Z + + + + + + + + + + +class FeedForward(nn.Module): + def __init__(self, config): + super().__init__() + if config[varables.DIM_FEEDFORWARD] == 0: + Dim_FeedForward = config[varables.DIM_ATTENTION] *4 + else: + Dim_FeedForward = config[varables.DIM_FEEDFORWARD] + self.Linear1 = nn.Linear(config[varables.DIM_EMBEDDING], Dim_FeedForward) + self.GELU = nn.GELU() + self.Linear2 = nn.Linear(Dim_FeedForward, config[varables.DIM_EMBEDDING]) + self.Dropout = nn.Dropout(config[varables.RATE_DROPOUT]) + + def forward(self,x): + x = self.Linear1(x) + x = self.GELU (x) + x = self.Dropout(x) + x = self.Linear2(x) + return x + + + + +class EncoderBlock(nn.Module): + def __init__(self, config): + super().__init__() + self.LayerNorm1 = 
nn.LayerNorm(config[varables.DIM_EMBEDDING]) + self.LayerNorm2 = nn.LayerNorm(config[varables.DIM_EMBEDDING]) + self.Dropout1 = nn.Dropout(config[varables.RATE_DROPOUT]) + self.Dropout2 = nn.Dropout(config[varables.RATE_DROPOUT]) + self.Attention = Attention( config) + self.FeedForward = FeedForward(config) + + def forward(self, X_Encoder,Mask_Encoder): + X_Encoder = self.Dropout1(X_Encoder + self.Attention (self.LayerNorm1(X_Encoder), None, Mask_Encoder)) + X_Encoder = self.Dropout2(X_Encoder + self.FeedForward(self.LayerNorm2(X_Encoder))) + return X_Encoder + +class DecoderBlock(nn.Module): + def __init__(self, config): + super().__init__() + self.LayerNorm1 = nn.LayerNorm(config[varables.DIM_EMBEDDING]) + self.LayerNorm2 = nn.LayerNorm(config[varables.DIM_EMBEDDING]) + self.LayerNorm3 = nn.LayerNorm(config[varables.DIM_EMBEDDING]) + self.Dropout1 = nn.Dropout(config[varables.RATE_DROPOUT]) + self.Dropout2 = nn.Dropout(config[varables.RATE_DROPOUT]) + self.Dropout3 = nn.Dropout(config[varables.RATE_DROPOUT]) + self.AttentionMasked = Attention( config) + self.AttentionCross = Attention( config) + self.FeedForward = FeedForward(config) + + def forward(self, X_Encoder,X_Decoder,Mask_Cross,Mask_Decoder): + X_Decoder = self.Dropout1(X_Decoder + self.AttentionMasked(self.LayerNorm1(X_Decoder), None, Mask_Decoder)) + X_Decoder = self.Dropout2(X_Decoder + self.AttentionCross ( X_Encoder, self.LayerNorm2(X_Decoder), Mask_Cross )) + X_Decoder = self.Dropout3(X_Decoder + self.FeedForward (self.LayerNorm3(X_Decoder) )) + return X_Decoder + + + + + + + + + + + + + + + + + + + +class Model(nn.Module): + def __init__(self, config): + super().__init__() + # Varables + self.Dim_Attention = config[varables.DIM_ATTENTION] + self.Token_Padding_Encoder = config["Token_Padding_Encoder"] + self.Token_Padding_Decoder = config["Token_Padding_Decoder"] + # Embedding and positional encoding layers + self.Embedding_Encoder = nn.Embedding(len(config["vocab_encoder"]), 
config[varables.DIM_ATTENTION]) + self.Embedding_Decoder = nn.Embedding(len(config["vocab_decoder"]), config[varables.DIM_ATTENTION]) + self.pos_emb = PositionalEncoder(config) + # Dropout and normalization layers + self.Dropout1 = nn.Dropout(config[varables.RATE_DROPOUT]) + self.Dropout2 = nn.Dropout(config[varables.RATE_DROPOUT]) + self.LayerNorm1 = nn.LayerNorm(config[varables.DIM_EMBEDDING]) + self.LayerNorm2 = nn.LayerNorm(config[varables.DIM_EMBEDDING]) + # Transformer layers + self.encoder_blocks = nn.ModuleList([EncoderBlock(config) for _ in range(config[varables.NUM_LAYERS])]) + self.decoder_blocks = nn.ModuleList([DecoderBlock(config) for _ in range(config[varables.NUM_LAYERS])]) + # Output layer + self.head = nn.Linear(config[varables.DIM_ATTENTION], len(config["vocab_decoder"]), bias=False) + # Init + self.apply(self._init_weights) + self.optimizer = None + # logger.info("number of parameters: %e", sum(p.numel() for p in self.parameters())) + + def _init_weights(self, module): + for p in module.parameters(): + if p.dim() > 1: + nn.init.xavier_uniform_(p) + # if isinstance(module, (nn.Linear, nn.Embedding)): + # module.weight.data.normal_(mean=0.0, std=0.02) + # if isinstance(module, nn.Linear) and module.bias is not None: + # module.bias.data.zero_() + # elif isinstance(module, nn.LayerNorm): + # module.bias.data.zero_() + # module.weight.data.fill_(1.0) + def init_optimizers(self,train_config): + optimizer = torch.optim.Adam(self.parameters(), lr=train_config[varables.RATE_LEARNING]) + return optimizer + def init_scheduler(self,train_config): + scheduler = torch.optim.lr_scheduler.StepLR(self.optimizer, step_size=train_config[varables.SIZE_STEP], gamma=train_config[varables.GAMMA]) + return scheduler + def get_collate_fn(self, vocab_encoder,vocab_decoder): + def collate(results): + X_Encoder = [a[0] for a in results] + X_Decoder = [a[1] for a in results] + boundary = -1 + max_len_x = max([len(a) for a in X_Encoder]) + max_len_y = max([len(a) for a in 
X_Decoder]) + x = torch.tensor([(a+[vocab_encoder[varables.TOKEN_PAD]]*(max_len_x-len(a))) for a in X_Encoder],dtype=torch.long) + y = torch.tensor([(a+[vocab_decoder[varables.TOKEN_PAD]]*(max_len_y-len(a))) for a in X_Decoder],dtype=torch.long) + return x,y,boundary + return collate + + + def generate_masks(self,X_Encoder, X_Decoder): + # Generate encoder, decoder, cross masks + T = X_Decoder.shape[1] + Mask_Encoder = (X_Encoder != self.Token_Padding_Encoder).unsqueeze(-2).unsqueeze(-2) + Mask_Decoder = (X_Decoder != self.Token_Padding_Decoder).unsqueeze(-2).unsqueeze(-2).repeat(1,1,T,1) + Mask_Cross = (X_Encoder != self.Token_Padding_Encoder).unsqueeze(-2).unsqueeze(-2) + mask_tril = torch.tril(torch.ones(T, T)).view(1, 1, T, T).to(Mask_Decoder.device) + Mask_Decoder = Mask_Decoder.masked_fill(mask_tril==0,0) + return Mask_Encoder,Mask_Decoder,Mask_Cross + + def forward(self, X_Encoder, X_Decoder, Y_Decoder_Ref=None,boundary=None): + Mask_Encoder, Mask_Decoder,Mask_Cross = self.generate_masks(X_Encoder, X_Decoder) + # preprocess + X_Encoder = self.Dropout1(self.Embedding_Encoder(X_Encoder) * math.sqrt(self.Dim_Attention) + self.pos_emb(X_Encoder.size(1))) + X_Decoder = self.Dropout2(self.Embedding_Decoder(X_Decoder) * math.sqrt(self.Dim_Attention) + self.pos_emb(X_Decoder.size(1))) + #### Now X_Encoder: BatchSize, SequenceLength, DimAttention + # Encoder blocks + for encoder_block in self.encoder_blocks: + X_Encoder = encoder_block(X_Encoder,Mask_Encoder) + X_Encoder = self.LayerNorm1(X_Encoder) + # Decoder blocks + for decoder_block in self.decoder_blocks: + X_Decoder = decoder_block(X_Encoder,X_Decoder,Mask_Cross,Mask_Decoder) + X_Decoder = self.LayerNorm2(X_Decoder) + Y_Decoder_Logits = self.head(X_Decoder) + loss = None + if Y_Decoder_Ref is not None: + loss = F.cross_entropy(Y_Decoder_Logits.view(-1, Y_Decoder_Logits.size(-1)), Y_Decoder_Ref.view(-1),ignore_index=self.Token_Padding_Decoder) + return Y_Decoder_Logits, loss + + + + + + + + + + + + + # def 
generate_masks(self,X_Encoder, X_Decoder): + # # Generate encoder, decoder, cross masks + # Mask_Encoder = (X_Encoder != self.Token_Padding_Encoder).unsqueeze(-2).int().cpu() + # Mask_Decoder = (X_Decoder != self.Token_Padding_Decoder).unsqueeze(-2).int().cpu() + # Mask_Cross = Mask_Decoder.unsqueeze(-1) @ Mask_Encoder.unsqueeze(-2) + # Mask_Encoder = Mask_Encoder.unsqueeze(-1) @ Mask_Encoder.unsqueeze(-2) + # Mask_Decoder = Mask_Decoder.unsqueeze(-1) @ Mask_Decoder.unsqueeze(-2) + # T = X_Decoder.shape[1] + # mask_tril = torch.tril(torch.ones(T, T)).view(1, 1, T, T) + # Mask_Decoder = Mask_Decoder.masked_fill(mask_tril==0,0) + # Mask_Encoder = Mask_Encoder.to(X_Encoder.device) + # Mask_Decoder = Mask_Decoder.to(X_Decoder.device) + # Mask_Cross = Mask_Cross.to(X_Encoder.device) + # return Mask_Encoder,Mask_Decoder,Mask_Cross diff --git a/SCMG/models/Reinvent/sampler.py b/SCMG/models/Reinvent/sampler.py new file mode 100644 index 0000000000000000000000000000000000000000..1c606302447534d425b50bbd15c153ea79895b65 --- /dev/null +++ b/SCMG/models/Reinvent/sampler.py @@ -0,0 +1,85 @@ +import random +import numpy as np +import torch +import torch.nn as nn +from torch.nn import functional as F + +def set_seed(seed): + random.seed(seed) + np.random.seed(seed) + torch.manual_seed(seed) + torch.cuda.manual_seed_all(seed) + +def top_k_logits(logits, k): + v, ix = torch.topk(logits, k) + out = logits.clone() + out[out < v[:, [-1]]] = -float('Inf') + return out + +@torch.no_grad() +def sample(model, x, steps, temperature=1.0, sample=False, top_k=None): + block_size = model.get_block_size() + model.eval() + for k in range(steps): + x_cond = x if x.size(1) <= block_size else x[:, -block_size:] + logits, _ = model(x_cond) + logits = logits[:, -1, :] / temperature + if top_k is not None: + logits = top_k_logits(logits, top_k) + probs = F.softmax(logits, dim=-1) + if sample: + ix = torch.multinomial(probs, num_samples=1) + else: + _, ix = torch.topk(probs, k=1, dim=-1) + x = 
torch.cat((x, ix), dim=1) + + return x + + + + +@torch.no_grad() +def sample(model, x, steps, temperature=1.0,boundary=None): + block_size = model.get_block_size() + model.eval() + for k in range(steps): + x_cond = x if x.size(1) <= block_size else x[:, -block_size:] + logits, _ = model(x_cond,boundary=boundary) + logits = logits[:, -1, :] / temperature + probs = F.softmax(logits, dim=-1) + ix = torch.multinomial(probs, num_samples=1) + x = torch.cat((x, ix), dim=1) + return x + +'L_5*C(=O)NCc1cccc(OC)c1.*c1nsc2ccccc12COc1cccc(CNC(=O)c2cccc(NC(=O)c3nsc4ccccc34)c2)c1' + +# for i in range(1,21): +def sample_L(i,option='string'): + # i=2 + prefix = 'L_'+str(i) + string_input = prefix + '*O=C1NN=Cc2c1cccc2.*O=C(C1CC1)N1CCNCC1' + array_input = [vocab[a] for a in [''] + list(string_input)] + boundary = [len(array_input)] + tensor_input = torch.tensor(array_input,device='cuda').unsqueeze(0).repeat(32,1) + boundary = boundary*32 + tensor_output = sample(model,tensor_input,250,boundary=boundary) + strings_output = [] + for j in range(tensor_output.shape[0]): + list_string_output = [inv[a] for a in tensor_output[j,boundary[j]:].cpu().numpy() if a != vocab['']] + # if list_string_output[0] == '': + # list_string_output = list_string_output[1:] + if list_string_output[-1] == '': + list_string_output = list_string_output[:-1] + string_output = ''.join(list_string_output) + strings_output.append(string_output) + print(string_output) + for j in range(tensor_output.shape[0]): + if test_valid(strings_output[j]): + print(1) + else: + print(0) + + # logits,_ = model(tensor_input,boundary=boundary) + + +['', 'L', '_', '5', '*', 'C', '(', '=', 'O', ')', 'N', 'C', 'c', '1', 'c', 'c', 'c', 'c', '(', 'O', 'C', ')', 'c', '1', '.', '*', 'c', '1', 'n', 's', 'c', '2', 'c', 'c', 'c', 'c', 'c', '1', '2', 'C', 'O', 'c', '1', 'c', 'c', 'c', 'c', '(', 'C', 'N', 'C', '(', '=', 'O', ')', 'c', '2', 'c', 'c', 'c', 'c', '(', 'N', 'C', '(', '=', 'O', ')', 'c', '3', 'n', 's', 'c', '4', 'c', 'c', 'c', 
'c', 'c', '3', '4', ')', 'c', '2', ')', 'c', '1', ''] diff --git a/SCMG/models/Reinvent_Scaffold_Decorator/__init__.py b/SCMG/models/Reinvent_Scaffold_Decorator/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/SCMG/models/Reinvent_Scaffold_Decorator/__pycache__/__init__.cpython-310.pyc b/SCMG/models/Reinvent_Scaffold_Decorator/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0cdc0c50cde1ad2794fe7ccb4470ee6560a2fff0 Binary files /dev/null and b/SCMG/models/Reinvent_Scaffold_Decorator/__pycache__/__init__.cpython-310.pyc differ diff --git a/SCMG/models/Reinvent_Scaffold_Decorator/__pycache__/model copy 2.cpython-310.pyc b/SCMG/models/Reinvent_Scaffold_Decorator/__pycache__/model copy 2.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4e507145c9134a603126084bf93df297b3de251f Binary files /dev/null and b/SCMG/models/Reinvent_Scaffold_Decorator/__pycache__/model copy 2.cpython-310.pyc differ diff --git a/SCMG/models/Reinvent_Scaffold_Decorator/__pycache__/model copy.cpython-310.pyc b/SCMG/models/Reinvent_Scaffold_Decorator/__pycache__/model copy.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..3a1741ccf7288d663bb655dfe90a6a08ac942d8d Binary files /dev/null and b/SCMG/models/Reinvent_Scaffold_Decorator/__pycache__/model copy.cpython-310.pyc differ diff --git a/SCMG/models/Reinvent_Scaffold_Decorator/__pycache__/sampler.cpython-310.pyc b/SCMG/models/Reinvent_Scaffold_Decorator/__pycache__/sampler.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8d4e71a297c36745f63960a12870eed7f0cf8b1e Binary files /dev/null and b/SCMG/models/Reinvent_Scaffold_Decorator/__pycache__/sampler.cpython-310.pyc differ diff --git a/SCMG/models/Reinvent_Scaffold_Decorator/model copy 2.py b/SCMG/models/Reinvent_Scaffold_Decorator/model copy 2.py new 
file mode 100644 index 0000000000000000000000000000000000000000..8b254fbe02eafd3f7370cfae17eb6729563aa260 --- /dev/null +++ b/SCMG/models/Reinvent_Scaffold_Decorator/model copy 2.py @@ -0,0 +1,420 @@ +import math +import logging + +import torch +import torch.nn as nn +from torch.nn import functional as F + +logger = logging.getLogger(__name__) +from SCMG.config import varables + +# class ModelConfig(): +# rate_dropout_embedding = 0.1 +# rate_dropout_residue = 0.1 +# rate_dropout_attention = 0.1 +# block_size=125 +# def __init__(self, size_vocab, **kwargs): +# self.size_vocab = size_vocab +# for k,v in kwargs.items(): +# setattr(self, k, v) + +class CausalSelfAttention(nn.Module): + def __init__(self, config): + super().__init__() + assert config[varables.DIM_ATTENTION] % config[varables.NUM_HEADS] == 0 + self.key = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION]) + self.query = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION]) + self.value = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION]) + self.dropout_attention = nn.Dropout(config[varables.RATE_DROPOUT]) + self.dropout_residue = nn.Dropout(config[varables.RATE_DROPOUT]) + self.projection = nn.Linear(config[varables.DIM_ATTENTION], config[varables.DIM_EMBEDDING]) + self.register_buffer("mask", torch.tril(torch.ones(config[varables.SIZE_BLOCK], config[varables.SIZE_BLOCK])) + .view(1, 1, config[varables.SIZE_BLOCK], config[varables.SIZE_BLOCK])) + self.n_head = config[varables.NUM_HEADS] + self.single_head_dim = config[varables.DIM_ATTENTION] // self.n_head + self.attention_features = config[varables.DIM_ATTENTION] + + def forward(self, x, layer_past=None): + B, T, C = x.size() + k = self.key(x).view(B, T, self.n_head,self.single_head_dim).transpose(1, 2) + q = self.query(x).view(B, T, self.n_head,self.single_head_dim).transpose(1, 2) + v = self.value(x).view(B, T, self.n_head,self.single_head_dim).transpose(1, 2) + att = (q @ 
k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1))) + att = att.masked_fill(self.mask[:,:,:T,:T] == 0, float('-inf')) + att = F.softmax(att, dim=-1) + att = self.dropout_attention(att) + y = att @ v + y = y.transpose(1, 2).contiguous().view(B, T, self.attention_features) + y = self.dropout_residue(self.projection(y)) + return y + + +class CrossAttention(nn.Module): + def __init__(self, config): + super().__init__() + assert config[varables.DIM_ATTENTION] % config[varables.NUM_HEADS] == 0 + self.key = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION]) + self.query = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION]) + self.value = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION]) + self.dropout_attention = nn.Dropout(config[varables.RATE_DROPOUT]) + self.dropout_residue = nn.Dropout(config[varables.RATE_DROPOUT]) + self.projection = nn.Linear(config[varables.DIM_ATTENTION], config[varables.DIM_EMBEDDING]) + self.n_head = config[varables.NUM_HEADS] + self.single_head_dim = config[varables.DIM_ATTENTION] // self.n_head + self.attention_features = config[varables.DIM_ATTENTION] + self.register_buffer("mask", torch.tril(torch.ones(config[varables.SIZE_BLOCK], config[varables.SIZE_BLOCK])) + .view(1, 1, config[varables.SIZE_BLOCK], config[varables.SIZE_BLOCK])) + + def forward(self, x_encoder,x_decoder, layer_past=None): + B_encoder, T_encoder, C_encoder = x_encoder.size() + B_decoder, T_decoder, C_decoder = x_decoder.size() + k = self.key( x_encoder).view(B_encoder, T_encoder, self.n_head,self.single_head_dim).transpose(1, 2) + q = self.query(x_decoder).view(B_encoder, T_decoder, self.n_head,self.single_head_dim).transpose(1, 2) + v = self.value(x_encoder).view(B_encoder, T_encoder, self.n_head,self.single_head_dim).transpose(1, 2) + att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1))) + att = att.masked_fill(self.mask[:,:,:T_decoder,:T_encoder] == 0, float('-inf')) + att = F.softmax(att, 
dim=-1) + att = self.dropout_attention(att) + y = att @ v + y = y.transpose(1, 2).contiguous().view(B_encoder, T_decoder, self.attention_features) + y = self.dropout_residue(self.projection(y)) + return y + + + + +class EncoderBlock(nn.Module): + def __init__(self, config): + super().__init__() + self.ln1 = nn.LayerNorm(config[varables.DIM_EMBEDDING]) + self.ln2 = nn.LayerNorm(config[varables.DIM_EMBEDDING]) + self.attn = CausalSelfAttention(config) + self.mlp = nn.Sequential( + nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_FEEDFORWARD]), + nn.GELU(), + nn.Linear(config[varables.DIM_FEEDFORWARD], config[varables.DIM_EMBEDDING]), + nn.Dropout(config[varables.RATE_DROPOUT]), + ) + + def forward(self, x): + # = y_input + x = x + self.attn(self.ln1(x)) + x = x + self.mlp(self.ln2(x)) + return x + +class DecoderBlock(nn.Module): + def __init__(self, config): + super().__init__() + self.ln1 = nn.LayerNorm(config[varables.DIM_EMBEDDING]) + self.ln2 = nn.LayerNorm(config[varables.DIM_EMBEDDING]) + self.masked_attn = CausalSelfAttention(config) + self.cross_attn = CrossAttention(config) + self.mlp = nn.Sequential( + nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_FEEDFORWARD]), + nn.GELU(), + nn.Linear(config[varables.DIM_FEEDFORWARD], config[varables.DIM_EMBEDDING]), + nn.Dropout(config[varables.RATE_DROPOUT]), + ) + + def forward(self, x_encoder,x): + # = y_input + x = x + self.masked_attn(self.ln1(x)) + x = x + self.cross_attn(x_encoder,self.ln1(x)) + x = x + self.mlp(self.ln2(x)) + return x + + + + + + + + + + + + + + + + + +import torch +import torch.nn as nn +import torch.nn.functional as F +import math + + +class Norm(nn.Module): + def __init__(self, d_model, eps = 1e-6): + super().__init__() + + self.size = d_model + + # create two learnable parameters to calibrate normalisation + self.alpha = nn.Parameter(torch.ones(self.size)) + self.bias = nn.Parameter(torch.zeros(self.size)) + + self.eps = eps + + def forward(self, x): + norm = 
self.alpha * (x - x.mean(dim=-1, keepdim=True)) \ + / (x.std(dim=-1, keepdim=True) + self.eps) + self.bias + return norm + +def attention(q, k, v, d_k, mask=None, dropout=None): + + scores = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(d_k) + + if mask is not None: + mask = mask.unsqueeze(1) + scores = scores.masked_fill(mask == 0, -1e9) + + scores = F.softmax(scores, dim=-1) + + if dropout is not None: + scores = dropout(scores) + + output = torch.matmul(scores, v) + return output + +class MultiHeadAttention(nn.Module): + def __init__(self, heads, d_model, dropout = 0.1): + super().__init__() + + self.d_model = d_model + self.d_k = d_model // heads + self.h = heads + + self.q_linear = nn.Linear(d_model, d_model) + self.v_linear = nn.Linear(d_model, d_model) + self.k_linear = nn.Linear(d_model, d_model) + + self.dropout = nn.Dropout(dropout) + self.out = nn.Linear(d_model, d_model) + + def forward(self, q, k, v, mask=None): + + bs = q.size(0) + + # perform linear operation and split into N heads + k = self.k_linear(k).view(bs, -1, self.h, self.d_k) + q = self.q_linear(q).view(bs, -1, self.h, self.d_k) + v = self.v_linear(v).view(bs, -1, self.h, self.d_k) + + # transpose to get dimensions bs * N * sl * d_model + k = k.transpose(1,2) + q = q.transpose(1,2) + v = v.transpose(1,2) + + + # calculate attention using function we will define next + scores = attention(q, k, v, self.d_k, mask, self.dropout) + # concatenate heads and put through final linear layer + concat = scores.transpose(1,2).contiguous()\ + .view(bs, -1, self.d_model) + output = self.out(concat) + + return output + +class FeedForward(nn.Module): + def __init__(self, d_model, d_ff=2048, dropout = 0.1): + super().__init__() + + # We set d_ff as a default to 2048 + self.linear_1 = nn.Linear(d_model, d_ff) + self.dropout = nn.Dropout(dropout) + self.linear_2 = nn.Linear(d_ff, d_model) + + def forward(self, x): + x = self.dropout(F.relu(self.linear_1(x))) + x = self.linear_2(x) + return x + + + + +import 
torch +import torch.nn as nn +import copy + + +class EncoderLayer(nn.Module): + def __init__(self, d_model, heads, dropout=0.1): + super().__init__() + self.norm_1 = Norm(d_model) + self.norm_2 = Norm(d_model) + self.attn = MultiHeadAttention(heads, d_model, dropout=dropout) + self.ff = FeedForward(d_model, dropout=dropout) + self.dropout_1 = nn.Dropout(dropout) + self.dropout_2 = nn.Dropout(dropout) + + def forward(self, x, mask): + x2 = self.norm_1(x) + x = x + self.dropout_1(self.attn(x2,x2,x2,mask)) + x2 = self.norm_2(x) + x = x + self.dropout_2(self.ff(x2)) + return x + +# build a decoder layer with two multi-head attention layers and +# one feed-forward layer +class DecoderLayer(nn.Module): + def __init__(self, d_model, heads, dropout=0.1): + super().__init__() + self.norm_1 = Norm(d_model) + self.norm_2 = Norm(d_model) + self.norm_3 = Norm(d_model) + + self.dropout_1 = nn.Dropout(dropout) + self.dropout_2 = nn.Dropout(dropout) + self.dropout_3 = nn.Dropout(dropout) + + self.attn_1 = MultiHeadAttention(heads, d_model, dropout=dropout) + self.attn_2 = MultiHeadAttention(heads, d_model, dropout=dropout) + self.ff = FeedForward(d_model, dropout=dropout) + + def forward(self, x, e_outputs, src_mask, trg_mask): + x2 = self.norm_1(x) + x = x + self.dropout_1(self.attn_1(x2, x2, x2, trg_mask)) + x2 = self.norm_2(x) + x = x + self.dropout_2(self.attn_2(x2, e_outputs, e_outputs, \ + src_mask)) + x2 = self.norm_3(x) + x = x + self.dropout_3(self.ff(x2)) + return x + + +import torch +import torch.nn as nn +import math +from torch.autograd import Variable + +class Embedder(nn.Module): + def __init__(self, vocab_size, d_model): + super().__init__() + self.d_model = d_model + self.embed = nn.Embedding(vocab_size, d_model) + def forward(self, x): + return self.embed(x) + +class PositionalEncoder(nn.Module): + def __init__(self, d_model, max_seq_len = 200, dropout = 0.1): + super().__init__() + self.d_model = d_model + self.dropout = nn.Dropout(dropout) + # create constant 
'pe' matrix with values dependant on + # pos and i + pe = torch.zeros(max_seq_len, d_model) + for pos in range(max_seq_len): + for i in range(0, d_model, 2): + pe[pos, i] = \ + math.sin(pos / (10000 ** ((2 * i)/d_model))) + pe[pos, i + 1] = \ + math.cos(pos / (10000 ** ((2 * (i + 1))/d_model))) + pe = pe.unsqueeze(0) + self.register_buffer('pe', pe) + + + def forward(self, x): + # make embeddings relatively larger + x = x * math.sqrt(self.d_model) + #add constant to embedding + seq_len = x.size(1) + pe = Variable(self.pe[:,:seq_len], requires_grad=False) + if x.is_cuda: + pe.cuda() + x = x + pe + return self.dropout(x) + +def get_clones(module, N): + return nn.ModuleList([copy.deepcopy(module) for i in range(N)]) + +class Encoder(nn.Module): + def __init__(self, vocab_size, d_model, N, heads, dropout): + super().__init__() + self.N = N + self.embed = Embedder(vocab_size, d_model) + self.pe = PositionalEncoder(d_model, dropout=dropout) + self.layers = get_clones(EncoderLayer(d_model, heads, dropout), N) + self.norm = Norm(d_model) + def forward(self, src, mask): + x = self.embed(src) + x = self.pe(x) + for i in range(self.N): + x = self.layers[i](x, mask) + return self.norm(x) + +class Decoder(nn.Module): + def __init__(self, vocab_size, d_model, N, heads, dropout): + super().__init__() + self.N = N + self.embed = Embedder(vocab_size, d_model) + self.pe = PositionalEncoder(d_model, dropout=dropout) + self.layers = get_clones(DecoderLayer(d_model, heads, dropout), N) + self.norm = Norm(d_model) + def forward(self, trg, e_outputs, src_mask, trg_mask): + x = self.embed(trg) + x = self.pe(x) + for i in range(self.N): + x = self.layers[i](x, e_outputs, src_mask, trg_mask) + return self.norm(x) + +class Model(nn.Module): + def __init__(self, config): + super().__init__() + self.encoder = Encoder(len(config["vocab_encoder"]), config[varables.DIM_ATTENTION], config[varables.NUM_LAYERS], config[varables.NUM_HEADS], config[varables.RATE_DROPOUT]) + self.decoder = 
Decoder(len(config["vocab_decoder"]), config[varables.DIM_ATTENTION], config[varables.NUM_LAYERS], config[varables.NUM_HEADS], config[varables.RATE_DROPOUT]) + self.out = nn.Linear(config[varables.DIM_ATTENTION], len(config["vocab_decoder"])) + # self.tok_emb = nn.Embedding(config[varables.SIZE_VOCAB], config[varables.DIM_EMBEDDING]) + # self.pos_emb = nn.Parameter(torch.zeros(1, config[varables.SIZE_BLOCK], config[varables.DIM_EMBEDDING])) + # self.drop = nn.Dropout(config[varables.RATE_DROPOUT]) + # self.encoder_blocks = nn.ModuleList([EncoderBlock(config) for _ in range(config[varables.NUM_LAYERS])]) + # self.decoder_blocks = nn.ModuleList([DecoderBlock(config) for _ in range(config[varables.NUM_LAYERS])]) + # self.blocks = nn.Sequential(*[DecoderBlock(config) for _ in range(config[varables.NUM_LAYERS])]) + # self.ln_f = nn.LayerNorm(config[varables.DIM_EMBEDDING]) + # self.head = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.SIZE_VOCAB], bias=False) + # self.block_size = config[varables.SIZE_BLOCK] + # self.apply(self._init_weights) + # logger.info("number of parameters: %e", sum(p.numel() for p in self.parameters())) + self.optimizer = None + + def get_block_size(self): + return self.block_size + + def _init_weights(self, module): + if isinstance(module, (nn.Linear, nn.Embedding)): + module.weight.data.normal_(mean=0.0, std=0.02) + if isinstance(module, nn.Linear) and module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.LayerNorm): + module.bias.data.zero_() + module.weight.data.fill_(1.0) + def init_optimizers(self,train_config): + optimizer = torch.optim.Adam(self.parameters(), lr=train_config[varables.RATE_LEARNING]) + return optimizer + def init_scheduler(self,train_config): + scheduler = torch.optim.lr_scheduler.StepLR(self.optimizer, step_size=train_config[varables.SIZE_STEP], gamma=train_config[varables.GAMMA]) + return scheduler + def get_collate_fn(self, vocab_encoder,vocab_decoder): + def collate(results): + 
x_in = [a[0] for a in results] + y_in = [a[1] for a in results] + boundary = -1 + max_len_x = max([len(a) for a in x_in]) + max_len_y = max([len(a) for a in y_in]) + x = torch.tensor([(a+[vocab_encoder[varables.TOKEN_PAD]]*(max_len_x-len(a))) for a in x_in],dtype=torch.long) + y = torch.tensor([(a+[vocab_decoder[varables.TOKEN_PAD]]*(max_len_y-len(a))) for a in y_in],dtype=torch.long) + return x,y,boundary + return collate + def forward(self, src, trg, trg_out, boundary=None): + src_mask = None + trg_mask = torch.tril(torch.ones(trg.shape[1], trg.shape[1])).view(1, 1, trg.shape[1], trg.shape[1]).to(trg.device) + e_outputs = self.encoder(src, src_mask) + d_output = self.decoder(trg, e_outputs, src_mask, trg_mask) + logits = self.out(d_output) + loss = None + if trg_out is not None: + loss = F.cross_entropy(logits.view(-1, logits.size(-1)), trg_out.view(-1)) + return logits, loss + +# mark test \ No newline at end of file diff --git a/SCMG/models/Reinvent_Scaffold_Decorator/model copy.py b/SCMG/models/Reinvent_Scaffold_Decorator/model copy.py new file mode 100644 index 0000000000000000000000000000000000000000..85ed98da342e63696371099158471e07cd1bf25c --- /dev/null +++ b/SCMG/models/Reinvent_Scaffold_Decorator/model copy.py @@ -0,0 +1,187 @@ +import math +import logging + +import torch +import torch.nn as nn +from torch.nn import functional as F + +logger = logging.getLogger(__name__) +from SCMG.config import varables + +# class ModelConfig(): +# rate_dropout_embedding = 0.1 +# rate_dropout_residue = 0.1 +# rate_dropout_attention = 0.1 +# block_size=125 +# def __init__(self, size_vocab, **kwargs): +# self.size_vocab = size_vocab +# for k,v in kwargs.items(): +# setattr(self, k, v) + +class CausalSelfAttention(nn.Module): + def __init__(self, config): + super().__init__() + assert config[varables.DIM_ATTENTION] % config[varables.NUM_HEADS] == 0 + self.key = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION]) + self.query = 
nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION]) + self.value = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION]) + self.dropout_attention = nn.Dropout(config[varables.RATE_DROPOUT]) + self.dropout_residue = nn.Dropout(config[varables.RATE_DROPOUT]) + self.projection = nn.Linear(config[varables.DIM_ATTENTION], config[varables.DIM_EMBEDDING]) + self.register_buffer("mask", torch.tril(torch.ones(config[varables.SIZE_BLOCK], config[varables.SIZE_BLOCK])) + .view(1, 1, config[varables.SIZE_BLOCK], config[varables.SIZE_BLOCK])) + self.n_head = config[varables.NUM_HEADS] + self.single_head_dim = config[varables.DIM_ATTENTION] // self.n_head + self.attention_features = config[varables.DIM_ATTENTION] + + def forward(self, x, layer_past=None): + B, T, C = x.size() + k = self.key(x).view(B, T, self.n_head,self.single_head_dim).transpose(1, 2) + q = self.query(x).view(B, T, self.n_head,self.single_head_dim).transpose(1, 2) + v = self.value(x).view(B, T, self.n_head,self.single_head_dim).transpose(1, 2) + att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1))) + att = att.masked_fill(self.mask[:,:,:T,:T] == 0, float('-inf')) + att = F.softmax(att, dim=-1) + att = self.dropout_attention(att) + y = att @ v + y = y.transpose(1, 2).contiguous().view(B, T, self.attention_features) + y = self.dropout_residue(self.projection(y)) + return y + + +class CrossAttention(nn.Module): + def __init__(self, config): + super().__init__() + assert config[varables.DIM_ATTENTION] % config[varables.NUM_HEADS] == 0 + self.key = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION]) + self.query = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION]) + self.value = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION]) + self.dropout_attention = nn.Dropout(config[varables.RATE_DROPOUT]) + self.dropout_residue = nn.Dropout(config[varables.RATE_DROPOUT]) + self.projection = 
nn.Linear(config[varables.DIM_ATTENTION], config[varables.DIM_EMBEDDING]) + self.n_head = config[varables.NUM_HEADS] + self.single_head_dim = config[varables.DIM_ATTENTION] // self.n_head + self.attention_features = config[varables.DIM_ATTENTION] + self.register_buffer("mask", torch.tril(torch.ones(config[varables.SIZE_BLOCK], config[varables.SIZE_BLOCK])) + .view(1, 1, config[varables.SIZE_BLOCK], config[varables.SIZE_BLOCK])) + + def forward(self, x_encoder,x_decoder, layer_past=None): + B_encoder, T_encoder, C_encoder = x_encoder.size() + B_decoder, T_decoder, C_decoder = x_decoder.size() + k = self.key( x_encoder).view(B_encoder, T_encoder, self.n_head,self.single_head_dim).transpose(1, 2) + q = self.query(x_decoder).view(B_encoder, T_decoder, self.n_head,self.single_head_dim).transpose(1, 2) + v = self.value(x_encoder).view(B_encoder, T_encoder, self.n_head,self.single_head_dim).transpose(1, 2) + att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1))) + att = att.masked_fill(self.mask[:,:,:T_decoder,:T_encoder] == 0, float('-inf')) + att = F.softmax(att, dim=-1) + att = self.dropout_attention(att) + y = att @ v + y = y.transpose(1, 2).contiguous().view(B_encoder, T_decoder, self.attention_features) + y = self.dropout_residue(self.projection(y)) + return y + + + + +class EncoderBlock(nn.Module): + def __init__(self, config): + super().__init__() + self.ln1 = nn.LayerNorm(config[varables.DIM_EMBEDDING]) + self.ln2 = nn.LayerNorm(config[varables.DIM_EMBEDDING]) + self.attn = CausalSelfAttention(config) + self.mlp = nn.Sequential( + nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_FEEDFORWARD]), + nn.GELU(), + nn.Linear(config[varables.DIM_FEEDFORWARD], config[varables.DIM_EMBEDDING]), + nn.Dropout(config[varables.RATE_DROPOUT]), + ) + + def forward(self, x): + # = y_input + x = x + self.attn(self.ln1(x)) + x = x + self.mlp(self.ln2(x)) + return x + +class DecoderBlock(nn.Module): + def __init__(self, config): + super().__init__() + self.ln1 = 
nn.LayerNorm(config[varables.DIM_EMBEDDING]) + self.ln2 = nn.LayerNorm(config[varables.DIM_EMBEDDING]) + self.masked_attn = CausalSelfAttention(config) + self.cross_attn = CrossAttention(config) + self.mlp = nn.Sequential( + nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_FEEDFORWARD]), + nn.GELU(), + nn.Linear(config[varables.DIM_FEEDFORWARD], config[varables.DIM_EMBEDDING]), + nn.Dropout(config[varables.RATE_DROPOUT]), + ) + + def forward(self, x_encoder,x): + # = y_input + x = x + self.masked_attn(self.ln1(x)) + x = x + self.cross_attn(x_encoder,self.ln1(x)) + x = x + self.mlp(self.ln2(x)) + return x + +class Model(nn.Module): + def __init__(self, config): + super().__init__() + self.tok_emb = nn.Embedding(config[varables.SIZE_VOCAB], config[varables.DIM_EMBEDDING]) + self.pos_emb = nn.Parameter(torch.zeros(1, config[varables.SIZE_BLOCK], config[varables.DIM_EMBEDDING])) + self.drop = nn.Dropout(config[varables.RATE_DROPOUT]) + self.encoder_blocks = nn.ModuleList([EncoderBlock(config) for _ in range(config[varables.NUM_LAYERS])]) + self.decoder_blocks = nn.ModuleList([DecoderBlock(config) for _ in range(config[varables.NUM_LAYERS])]) + # self.blocks = nn.Sequential(*[DecoderBlock(config) for _ in range(config[varables.NUM_LAYERS])]) + self.ln_f = nn.LayerNorm(config[varables.DIM_EMBEDDING]) + self.head = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.SIZE_VOCAB], bias=False) + self.block_size = config[varables.SIZE_BLOCK] + self.apply(self._init_weights) + logger.info("number of parameters: %e", sum(p.numel() for p in self.parameters())) + self.optimizer = None + + def get_block_size(self): + return self.block_size + + def _init_weights(self, module): + if isinstance(module, (nn.Linear, nn.Embedding)): + module.weight.data.normal_(mean=0.0, std=0.02) + if isinstance(module, nn.Linear) and module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.LayerNorm): + module.bias.data.zero_() + module.weight.data.fill_(1.0) 
+ def init_optimizers(self,train_config): + optimizer = torch.optim.Adam(self.parameters(), lr=train_config[varables.RATE_LEARNING]) + return optimizer + def init_scheduler(self,train_config): + scheduler = torch.optim.lr_scheduler.StepLR(self.optimizer, step_size=train_config[varables.SIZE_STEP], gamma=train_config[varables.GAMMA]) + return scheduler + def get_collate_fn(self, vocab): + def collate(results): + x_in = [a[0] for a in results] + y_in = [a[1] for a in results] + boundary = -1 + max_len_x = max([len(a) for a in x_in]) + max_len_y = max([len(a) for a in y_in]) + x = torch.tensor([(a+[vocab[varables.TOKEN_PAD]]*(max_len_x-len(a))) for a in x_in],dtype=torch.long) + y = torch.tensor([(a+[vocab[varables.TOKEN_PAD]]*(max_len_y-len(a))) for a in y_in],dtype=torch.long) + return x,y,boundary + return collate + + def forward(self, x_in, y_in, y_out=None,boundary=None): + x_in = self.drop(self.tok_emb(x_in) + self.pos_emb[:, :x_in.size()[1], :]) + y_in = self.drop(self.tok_emb(y_in) + self.pos_emb[:, :y_in.size()[1], :]) + # + for encoder_block in self.encoder_blocks: + x_in = encoder_block(x_in) + x_in = self.ln_f(x_in) + for decoder_block in self.decoder_blocks: + y_in = decoder_block(x_in,y_in) + y_in = self.ln_f(y_in) + logits = self.head(y_in) + loss = None + if y_out is not None: + loss = F.cross_entropy(logits.view(-1, logits.size(-1)), y_out.view(-1)) + return logits, loss + +# mark test \ No newline at end of file diff --git a/SCMG/models/Reinvent_Scaffold_Decorator/model.py b/SCMG/models/Reinvent_Scaffold_Decorator/model.py new file mode 100644 index 0000000000000000000000000000000000000000..2efedb5615ab35061aa476a38704b4c5826e3cfe --- /dev/null +++ b/SCMG/models/Reinvent_Scaffold_Decorator/model.py @@ -0,0 +1,276 @@ + +Skip to content + + Why GitHub? 
+ +Team +Enterprise +Explore +Marketplace +Pricing + +Sign in +Sign up +undeadpixel / +reinvent-scaffold-decorator +Public + +Code +Issues 3 +Pull requests +Actions +Projects +Wiki +Security + + Insights + +reinvent-scaffold-decorator/models/model.py / +Arús-Pous, Josep updated to revised version +Latest commit 37d0a8a on May 8, 2020 +History +0 contributors +136 lines (118 sloc) 5.75 KB +""" +Model class. +""" + +import torch +import torch.nn as tnn + +import models.decorator as mdec + + +class DecoratorModel: + + def __init__(self, vocabulary, decorator, max_sequence_length=256, no_cuda=False, mode="train"): + """ + Implements the likelihood and sampling functions of the decorator model. + :param vocabulary: A DecoratorVocabulary instance with the vocabularies of both the encoder and decoder. + :param network_params: A dict with parameters for the encoder and decoder networks. + :param decorator: An decorator network instance. + :param max_sequence_length: Maximium number of tokens allowed to sample. + :param no_cuda: Forces the model not to use CUDA, even if it is available. + :param mode: Mode in which the model should be initialized. + :return: + """ + self.vocabulary = vocabulary + self.max_sequence_length = max_sequence_length + self.network = decorator + + if torch.cuda.is_available() and not no_cuda: + self.network.cuda() + + self._nll_loss = tnn.NLLLoss(reduction="none", ignore_index=0) + self.set_mode(mode) + + @classmethod + def load_from_file(cls, path, mode="train"): + """ + Loads a model from a single file + :param path: Path to the saved model. + :param mode: Mode in which the model should be initialized. + :return: An instance of the RNN. + """ + data = torch.load(path) + + decorator = mdec.Decorator(**data["decorator"]["params"]) + decorator.load_state_dict(data["decorator"]["state"]) + + model = DecoratorModel( + decorator=decorator, + mode=mode, + **data["model"] + ) + + return model + + def save(self, path): + """ + Saves the model to a file. 
+ :param path: Path to the file which the model will be saved to. + """ + save_dict = { + 'model': { + 'vocabulary': self.vocabulary, + 'max_sequence_length': self.max_sequence_length + }, + 'decorator': { + 'params': self.network.get_params(), + 'state': self.network.state_dict() + } + } + torch.save(save_dict, path) + + def set_mode(self, mode): + """ + Changes the mode of the RNN to training or eval. + :param mode: Mode to change to (training, eval) + :return: The model instance. + """ + if mode == "sampling" or mode == "eval": + self.network.eval() + else: + self.network.train() + return self + + def likelihood(self, scaffold_seqs, scaffold_seq_lengths, decoration_seqs, decoration_seq_lengths, with_attention_weights=False): + """ + Retrieves the likelihood of a scaffold and its respective decorations. + :param scaffold_seqs: (batch, seq) A batch of padded scaffold sequences. + :param scaffold_seq_lengths: The length of the scaffold sequences (for packing purposes). + :param decoration_seqs: (batch, seq) A batch of decorator sequences. + :param decoration_seq_lengths: The length of the decorator sequences (for packing purposes). + :return: (batch) Log likelihood for each item in the batch. + """ + + # NOTE: the decoration_seq_lengths have a - 1 to prevent the end token to be forward-passed. + logits, attention_weights = self.network(scaffold_seqs, scaffold_seq_lengths, decoration_seqs, + decoration_seq_lengths - 1) # (batch, seq - 1, voc) + log_probs = logits.log_softmax(dim=2).transpose(1, 2) # (batch, voc, seq - 1) + + logits = self._nll_loss(log_probs, decoration_seqs[:, 1:]).sum(dim=1) # (batch) + if with_attention_weights: + return logits, attention_weights + else: + return logits + + @torch.no_grad() + def sample_decorations(self, scaffold_seqs, scaffold_seq_lengths): + """ + Samples as many decorations as scaffolds in the tensor. + :param scaffold_seqs: A tensor with the scaffolds to sample already encoded and padded. 
+ :param scaffold_seq_lengths: A tensor with the length of the scaffolds. + :return: An iterator with (scaffold_smi, decoration_smi, nll) triplets. + """ + batch_size = scaffold_seqs.size(0) + input_vector = torch.full( + (batch_size, 1), self.vocabulary.decoration_vocabulary["^"], dtype=torch.long).cuda() # (batch, 1) + seq_lengths = torch.ones(batch_size) # (batch) + encoder_padded_seqs, hidden_states = self.network.forward_encoder(scaffold_seqs, scaffold_seq_lengths) + nlls = torch.zeros(batch_size).cuda() + not_finished = torch.ones(batch_size, 1, dtype=torch.long).cuda() + sequences = [] + for _ in range(self.max_sequence_length - 1): + logits, hidden_states, _ = self.network.forward_decoder( + input_vector, seq_lengths, encoder_padded_seqs, hidden_states) # (batch, 1, voc) + probs = logits.softmax(dim=2).squeeze() # (batch, voc) + log_probs = logits.log_softmax(dim=2).squeeze() # (batch, voc) + input_vector = torch.multinomial(probs, 1)*not_finished # (batch, 1) + sequences.append(input_vector) + nlls += self._nll_loss(log_probs, input_vector.squeeze()) + not_finished = (input_vector > 1).type(torch.long) # 0 is padding, 1 is end token + if not_finished.sum() == 0: + break + + decoration_smiles = [self.vocabulary.decode_decoration(seq) + for seq in torch.cat(sequences, 1).data.cpu().numpy()] + scaffold_smiles = [self.vocabulary.decode_scaffold(seq) for seq in scaffold_seqs.data.cpu().numpy()] + return zip(scaffold_smiles, decoration_smiles, nlls.data.cpu().numpy().tolist()) + + + + + + + + +class Model(nn.Module): + def __init__(self, config): + super().__init__() + # Varables + self.Dim_Attention = config[varables.DIM_ATTENTION] + self.Token_Padding_Encoder = config["Token_Padding_Encoder"] + self.Token_Padding_Decoder = config["Token_Padding_Decoder"] + # Embedding and positional encoding layers + self.Embedding_Encoder = nn.Embedding(len(config["vocab_encoder"]), config[varables.DIM_ATTENTION]) + self.Embedding_Decoder = 
nn.Embedding(len(config["vocab_decoder"]), config[varables.DIM_ATTENTION]) + self.pos_emb = PositionalEncoder(config) + # Dropout and normalization layers + self.Dropout1 = nn.Dropout(config[varables.RATE_DROPOUT]) + self.Dropout2 = nn.Dropout(config[varables.RATE_DROPOUT]) + self.LayerNorm1 = nn.LayerNorm(config[varables.DIM_EMBEDDING]) + self.LayerNorm2 = nn.LayerNorm(config[varables.DIM_EMBEDDING]) + # Transformer layers + self.encoder_blocks = nn.ModuleList([EncoderBlock(config) for _ in range(config[varables.NUM_LAYERS])]) + self.decoder_blocks = nn.ModuleList([DecoderBlock(config) for _ in range(config[varables.NUM_LAYERS])]) + # Output layer + self.head = nn.Linear(config[varables.DIM_ATTENTION], len(config["vocab_decoder"]), bias=False) + # Init + self.apply(self._init_weights) + self.optimizer = None + # logger.info("number of parameters: %e", sum(p.numel() for p in self.parameters())) + + def _init_weights(self, module): + for p in module.parameters(): + if p.dim() > 1: + nn.init.xavier_uniform_(p) + # if isinstance(module, (nn.Linear, nn.Embedding)): + # module.weight.data.normal_(mean=0.0, std=0.02) + # if isinstance(module, nn.Linear) and module.bias is not None: + # module.bias.data.zero_() + # elif isinstance(module, nn.LayerNorm): + # module.bias.data.zero_() + # module.weight.data.fill_(1.0) + def init_optimizers(self,train_config): + optimizer = torch.optim.Adam(self.parameters(), lr=train_config[varables.RATE_LEARNING]) + return optimizer + def init_scheduler(self,train_config): + scheduler = torch.optim.lr_scheduler.StepLR(self.optimizer, step_size=train_config[varables.SIZE_STEP], gamma=train_config[varables.GAMMA]) + return scheduler + def get_collate_fn(self, vocab_encoder,vocab_decoder): + def collate(results): + X_Encoder = [a[0] for a in results] + X_Decoder = [a[1] for a in results] + boundary = -1 + max_len_x = max([len(a) for a in X_Encoder]) + max_len_y = max([len(a) for a in X_Decoder]) + x = 
torch.tensor([(a+[vocab_encoder[varables.TOKEN_PAD]]*(max_len_x-len(a))) for a in X_Encoder],dtype=torch.long) + y = torch.tensor([(a+[vocab_decoder[varables.TOKEN_PAD]]*(max_len_y-len(a))) for a in X_Decoder],dtype=torch.long) + return x,y,boundary + return collate + + def generate_masks(self,X_Encoder, X_Decoder): + # Generate encoder, decoder, cross masks + T = X_Decoder.shape[1] + Mask_Encoder = (X_Encoder != self.Token_Padding_Encoder).unsqueeze(-2).unsqueeze(-2) + Mask_Decoder = (X_Decoder != self.Token_Padding_Decoder).unsqueeze(-2).unsqueeze(-2).repeat(1,1,T,1) + Mask_Cross = (X_Encoder != self.Token_Padding_Encoder).unsqueeze(-2).unsqueeze(-2) + mask_tril = torch.tril(torch.ones(T, T)).view(1, 1, T, T).to(Mask_Decoder.device) + Mask_Decoder = Mask_Decoder.masked_fill(mask_tril==0,0) + return Mask_Encoder,Mask_Decoder,Mask_Cross + + def forward(self, X_Encoder, X_Decoder, Y_Decoder_Ref=None,boundary=None): + Mask_Encoder, Mask_Decoder,Mask_Cross = self.generate_masks(X_Encoder, X_Decoder) + # preprocess + X_Encoder = self.Dropout1(self.Embedding_Encoder(X_Encoder) * math.sqrt(self.Dim_Attention) + self.pos_emb(X_Encoder.size(1))) + X_Decoder = self.Dropout2(self.Embedding_Decoder(X_Decoder) * math.sqrt(self.Dim_Attention) + self.pos_emb(X_Decoder.size(1))) + #### Now X_Encoder: BatchSize, SequenceLength, DimAttention + # Encoder blocks + for encoder_block in self.encoder_blocks: + X_Encoder = encoder_block(X_Encoder,Mask_Encoder) + X_Encoder = self.LayerNorm1(X_Encoder) + # Decoder blocks + for decoder_block in self.decoder_blocks: + X_Decoder = decoder_block(X_Encoder,X_Decoder,Mask_Cross,Mask_Decoder) + X_Decoder = self.LayerNorm2(X_Decoder) + Y_Decoder_Logits = self.head(X_Decoder) + loss = None + if Y_Decoder_Ref is not None: + loss = F.cross_entropy(Y_Decoder_Logits.view(-1, Y_Decoder_Logits.size(-1)), Y_Decoder_Ref.view(-1),ignore_index=self.Token_Padding_Decoder) + return Y_Decoder_Logits, loss + + # def generate_masks(self,X_Encoder, X_Decoder): + # 
# Generate encoder, decoder, cross masks + # Mask_Encoder = (X_Encoder != self.Token_Padding_Encoder).unsqueeze(-2).int().cpu() + # Mask_Decoder = (X_Decoder != self.Token_Padding_Decoder).unsqueeze(-2).int().cpu() + # Mask_Cross = Mask_Decoder.unsqueeze(-1) @ Mask_Encoder.unsqueeze(-2) + # Mask_Encoder = Mask_Encoder.unsqueeze(-1) @ Mask_Encoder.unsqueeze(-2) + # Mask_Decoder = Mask_Decoder.unsqueeze(-1) @ Mask_Decoder.unsqueeze(-2) + # T = X_Decoder.shape[1] + # mask_tril = torch.tril(torch.ones(T, T)).view(1, 1, T, T) + # Mask_Decoder = Mask_Decoder.masked_fill(mask_tril==0,0) + # Mask_Encoder = Mask_Encoder.to(X_Encoder.device) + # Mask_Decoder = Mask_Decoder.to(X_Decoder.device) + # Mask_Cross = Mask_Cross.to(X_Encoder.device) + # return Mask_Encoder,Mask_Decoder,Mask_Cross diff --git a/SCMG/models/Reinvent_Scaffold_Decorator/sampler.py b/SCMG/models/Reinvent_Scaffold_Decorator/sampler.py new file mode 100644 index 0000000000000000000000000000000000000000..1c606302447534d425b50bbd15c153ea79895b65 --- /dev/null +++ b/SCMG/models/Reinvent_Scaffold_Decorator/sampler.py @@ -0,0 +1,85 @@ +import random +import numpy as np +import torch +import torch.nn as nn +from torch.nn import functional as F + +def set_seed(seed): + random.seed(seed) + np.random.seed(seed) + torch.manual_seed(seed) + torch.cuda.manual_seed_all(seed) + +def top_k_logits(logits, k): + v, ix = torch.topk(logits, k) + out = logits.clone() + out[out < v[:, [-1]]] = -float('Inf') + return out + +@torch.no_grad() +def sample(model, x, steps, temperature=1.0, sample=False, top_k=None): + block_size = model.get_block_size() + model.eval() + for k in range(steps): + x_cond = x if x.size(1) <= block_size else x[:, -block_size:] + logits, _ = model(x_cond) + logits = logits[:, -1, :] / temperature + if top_k is not None: + logits = top_k_logits(logits, top_k) + probs = F.softmax(logits, dim=-1) + if sample: + ix = torch.multinomial(probs, num_samples=1) + else: + _, ix = torch.topk(probs, k=1, dim=-1) + x 
= torch.cat((x, ix), dim=1) + + return x + + + + +@torch.no_grad() +def sample(model, x, steps, temperature=1.0,boundary=None): + block_size = model.get_block_size() + model.eval() + for k in range(steps): + x_cond = x if x.size(1) <= block_size else x[:, -block_size:] + logits, _ = model(x_cond,boundary=boundary) + logits = logits[:, -1, :] / temperature + probs = F.softmax(logits, dim=-1) + ix = torch.multinomial(probs, num_samples=1) + x = torch.cat((x, ix), dim=1) + return x + +'L_5*C(=O)NCc1cccc(OC)c1.*c1nsc2ccccc12COc1cccc(CNC(=O)c2cccc(NC(=O)c3nsc4ccccc34)c2)c1' + +# for i in range(1,21): +def sample_L(i,option='string'): + # i=2 + prefix = 'L_'+str(i) + string_input = prefix + '*O=C1NN=Cc2c1cccc2.*O=C(C1CC1)N1CCNCC1' + array_input = [vocab[a] for a in [''] + list(string_input)] + boundary = [len(array_input)] + tensor_input = torch.tensor(array_input,device='cuda').unsqueeze(0).repeat(32,1) + boundary = boundary*32 + tensor_output = sample(model,tensor_input,250,boundary=boundary) + strings_output = [] + for j in range(tensor_output.shape[0]): + list_string_output = [inv[a] for a in tensor_output[j,boundary[j]:].cpu().numpy() if a != vocab['']] + # if list_string_output[0] == '': + # list_string_output = list_string_output[1:] + if list_string_output[-1] == '': + list_string_output = list_string_output[:-1] + string_output = ''.join(list_string_output) + strings_output.append(string_output) + print(string_output) + for j in range(tensor_output.shape[0]): + if test_valid(strings_output[j]): + print(1) + else: + print(0) + + # logits,_ = model(tensor_input,boundary=boundary) + + +['', 'L', '_', '5', '*', 'C', '(', '=', 'O', ')', 'N', 'C', 'c', '1', 'c', 'c', 'c', 'c', '(', 'O', 'C', ')', 'c', '1', '.', '*', 'c', '1', 'n', 's', 'c', '2', 'c', 'c', 'c', 'c', 'c', '1', '2', 'C', 'O', 'c', '1', 'c', 'c', 'c', 'c', '(', 'C', 'N', 'C', '(', '=', 'O', ')', 'c', '2', 'c', 'c', 'c', 'c', '(', 'N', 'C', '(', '=', 'O', ')', 'c', '3', 'n', 's', 'c', '4', 'c', 'c', 'c', 
'c', 'c', '3', '4', ')', 'c', '2', ')', 'c', '1', ''] diff --git a/SCMG/models/Transformer/__init__.py b/SCMG/models/Transformer/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..87cb7367b08ea138065cb521a41cd746f06b57ef --- /dev/null +++ b/SCMG/models/Transformer/__init__.py @@ -0,0 +1 @@ +from .model import * diff --git a/SCMG/models/Transformer/__pycache__/__init__.cpython-310.pyc b/SCMG/models/Transformer/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9d2712f8eda4d9d04ec9a27c39f87615e7cdcd1b Binary files /dev/null and b/SCMG/models/Transformer/__pycache__/__init__.cpython-310.pyc differ diff --git a/SCMG/models/Transformer/__pycache__/model copy 2.cpython-310.pyc b/SCMG/models/Transformer/__pycache__/model copy 2.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..81652cccdc7c73478f43eb4e6156e47c3b89c8ab Binary files /dev/null and b/SCMG/models/Transformer/__pycache__/model copy 2.cpython-310.pyc differ diff --git a/SCMG/models/Transformer/__pycache__/model copy 3.cpython-310.pyc b/SCMG/models/Transformer/__pycache__/model copy 3.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0bbe759cc798a5e956d173f2439ae0d5e75e6ef7 Binary files /dev/null and b/SCMG/models/Transformer/__pycache__/model copy 3.cpython-310.pyc differ diff --git a/SCMG/models/Transformer/__pycache__/model copy.cpython-310.pyc b/SCMG/models/Transformer/__pycache__/model copy.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..7e9d890c989ba1479718238c89d8abc62083fa22 Binary files /dev/null and b/SCMG/models/Transformer/__pycache__/model copy.cpython-310.pyc differ diff --git a/SCMG/models/Transformer/__pycache__/model.cpython-310.pyc b/SCMG/models/Transformer/__pycache__/model.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ca7f600f46ec69862f287dab51151445c0d05a5b Binary 
files /dev/null and b/SCMG/models/Transformer/__pycache__/model.cpython-310.pyc differ diff --git a/SCMG/models/Transformer/__pycache__/sampler.cpython-310.pyc b/SCMG/models/Transformer/__pycache__/sampler.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2ed2bc6c3cbce2c0be07be58a51ec73edc35f213 Binary files /dev/null and b/SCMG/models/Transformer/__pycache__/sampler.cpython-310.pyc differ diff --git a/SCMG/models/Transformer/model copy 2.py b/SCMG/models/Transformer/model copy 2.py new file mode 100644 index 0000000000000000000000000000000000000000..3e2f3b32b2a6dcfeed7c7cb8a924e4ef420f9d8a --- /dev/null +++ b/SCMG/models/Transformer/model copy 2.py @@ -0,0 +1,175 @@ +import math +import logging + +import torch +import torch.nn as nn +from torch.nn import functional as F + +logger = logging.getLogger(__name__) +from SCMG.config import varables + +class CausalSelfAttention(nn.Module): + def __init__(self, config): + super().__init__() + assert config[varables.DIM_ATTENTION] % config[varables.NUM_HEADS] == 0 + self.key = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION]) + self.query = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION]) + self.value = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION]) + self.dropout_attention = nn.Dropout(config[varables.RATE_DROPOUT]) + self.dropout_residue = nn.Dropout(config[varables.RATE_DROPOUT]) + self.projection = nn.Linear(config[varables.DIM_ATTENTION], config[varables.DIM_EMBEDDING]) + self.register_buffer("mask", torch.tril(torch.ones(config[varables.SIZE_BLOCK], config[varables.SIZE_BLOCK])) + .view(1, 1, config[varables.SIZE_BLOCK], config[varables.SIZE_BLOCK])) + self.n_head = config[varables.NUM_HEADS] + self.single_head_dim = config[varables.DIM_ATTENTION] // self.n_head + self.attention_features = config[varables.DIM_ATTENTION] + + def forward(self, x, layer_past=None): + B, T, C = x.size() + k = self.key(x).view(B, 
T, self.n_head,self.single_head_dim).transpose(1, 2) + q = self.query(x).view(B, T, self.n_head,self.single_head_dim).transpose(1, 2) + v = self.value(x).view(B, T, self.n_head,self.single_head_dim).transpose(1, 2) + att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1))) + att = att.masked_fill(self.mask[:,:,:T,:T] == 0, float('-inf')) + att = F.softmax(att, dim=-1) + att = self.dropout_attention(att) + y = att @ v + y = y.transpose(1, 2).contiguous().view(B, T, self.attention_features) + y = self.dropout_residue(self.projection(y)) + return y + + +class Attention(nn.Module): + def __init__(self, config): + super().__init__() + assert config[varables.DIM_ATTENTION] % config[varables.NUM_HEADS] == 0 + self.key = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION]) + self.query = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION]) + self.value = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION]) + self.dropout_attention = nn.Dropout(config[varables.RATE_DROPOUT]) + self.dropout_residue = nn.Dropout(config[varables.RATE_DROPOUT]) + self.projection = nn.Linear(config[varables.DIM_ATTENTION], config[varables.DIM_EMBEDDING]) + self.n_head = config[varables.NUM_HEADS] + self.single_head_dim = config[varables.DIM_ATTENTION] // self.n_head + self.attention_features = config[varables.DIM_ATTENTION] + + def forward(self, x_encoder,x_decoder, mask): + B_encoder, T_encoder, C_encoder = x_encoder.size() + B_decoder, T_decoder, C_decoder = x_decoder.size() + k = self.key(x_encoder).view(B_encoder, T_encoder, self.n_head,self.single_head_dim).transpose(1, 2) + q = self.query(x_decoder).view(B_encoder, T_decoder, self.n_head,self.single_head_dim).transpose(1, 2) + v = self.value(x_encoder).view(B_encoder, T_encoder, self.n_head,self.single_head_dim).transpose(1, 2) + att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1))) + att = att.masked_fill(mask == 0, float('-inf')) + att = F.softmax(att, 
dim=-1) + att = self.dropout_attention(att) + y = att @ v + y = y.transpose(1, 2).contiguous().view(B_encoder, T_decoder, self.attention_features) + y = self.dropout_residue(self.projection(y)) + return y + + + + +class EncoderBlock(nn.Module): + def __init__(self, config): + super().__init__() + self.ln1 = nn.LayerNorm(config[varables.DIM_EMBEDDING]) + self.ln2 = nn.LayerNorm(config[varables.DIM_EMBEDDING]) + self.attn = Attention(config) + self.mlp = nn.Sequential( + nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_FEEDFORWARD]), + nn.GELU(), + nn.Linear(config[varables.DIM_FEEDFORWARD], config[varables.DIM_EMBEDDING]), + nn.Dropout(config[varables.RATE_DROPOUT]), + ) + def forward(self, x): + x = self.ln1(x + self.attn(x,x)) + x = self.ln2(x + self.mlp(x)) + return x + +class DecoderBlock(nn.Module): + def __init__(self, config): + super().__init__() + self.ln1 = nn.LayerNorm(config[varables.DIM_EMBEDDING]) + self.ln2 = nn.LayerNorm(config[varables.DIM_EMBEDDING]) + self.masked_attn = CausalSelfAttention(config) + self.cross_attn = CrossAttention(config) + self.mlp = nn.Sequential( + nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_FEEDFORWARD]), + nn.GELU(), + nn.Linear(config[varables.DIM_FEEDFORWARD], config[varables.DIM_EMBEDDING]), + nn.Dropout(config[varables.RATE_DROPOUT]), + ) + + def forward(self, x_encoder,x): + # = y_input + x = x + self.masked_attn(self.ln1(x)) + x = x + self.cross_attn(x_encoder,self.ln1(x)) + x = x + self.mlp(self.ln2(x)) + return x + + +class Model(nn.Module): + def __init__(self, config): + super().__init__() + self.tok_emb_encoder = nn.Embedding(len(config["vocab_encoder"]), config[varables.DIM_EMBEDDING]) + self.tok_emb_decoder = nn.Embedding(len(config["vocab_decoder"]), config[varables.DIM_EMBEDDING]) + self.pos_emb = nn.Parameter(torch.zeros(1, config[varables.SIZE_BLOCK], config[varables.DIM_EMBEDDING])) + self.drop = nn.Dropout(config[varables.RATE_DROPOUT]) + self.encoder_blocks = 
nn.ModuleList([EncoderBlock(config) for _ in range(config[varables.NUM_LAYERS])]) + self.decoder_blocks = nn.ModuleList([DecoderBlock(config) for _ in range(config[varables.NUM_LAYERS])]) + # self.blocks = nn.Sequential(*[DecoderBlock(config) for _ in range(config[varables.NUM_LAYERS])]) + self.ln_f = nn.LayerNorm(config[varables.DIM_EMBEDDING]) + self.head = nn.Linear(config[varables.DIM_EMBEDDING], len(config["vocab_decoder"]), bias=False) + self.block_size = config[varables.SIZE_BLOCK] + self.apply(self._init_weights) + logger.info("number of parameters: %e", sum(p.numel() for p in self.parameters())) + self.optimizer = None + + def get_block_size(self): + return self.block_size + + def _init_weights(self, module): + if isinstance(module, (nn.Linear, nn.Embedding)): + module.weight.data.normal_(mean=0.0, std=0.02) + if isinstance(module, nn.Linear) and module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.LayerNorm): + module.bias.data.zero_() + module.weight.data.fill_(1.0) + def init_optimizers(self,train_config): + optimizer = torch.optim.Adam(self.parameters(), lr=train_config[varables.RATE_LEARNING]) + return optimizer + def init_scheduler(self,train_config): + scheduler = torch.optim.lr_scheduler.StepLR(self.optimizer, step_size=train_config[varables.SIZE_STEP], gamma=train_config[varables.GAMMA]) + return scheduler + def get_collate_fn(self, vocab_encoder,vocab_decoder): + def collate(results): + x_in = [a[0] for a in results] + y_in = [a[1] for a in results] + boundary = -1 + max_len_x = max([len(a) for a in x_in]) + max_len_y = max([len(a) for a in y_in]) + x = torch.tensor([(a+[vocab_encoder[varables.TOKEN_PAD]]*(max_len_x-len(a))) for a in x_in],dtype=torch.long) + y = torch.tensor([(a+[vocab_decoder[varables.TOKEN_PAD]]*(max_len_y-len(a))) for a in y_in],dtype=torch.long) + return x,y,boundary + return collate + + def forward(self, x_in, y_in, y_out=None,boundary=None): + x_in = self.drop(self.tok_emb_encoder(x_in) + 
self.pos_emb[:, :x_in.size()[1], :]) + y_in = self.drop(self.tok_emb_decoder(y_in) + self.pos_emb[:, :y_in.size()[1], :]) + # + for encoder_block in self.encoder_blocks: + x_in = encoder_block(x_in) + x_in = self.ln_f(x_in) + for decoder_block in self.decoder_blocks: + y_in = decoder_block(x_in,y_in) + y_in = self.ln_f(y_in) + logits = self.head(y_in) + loss = None + if y_out is not None: + loss = F.cross_entropy(logits.view(-1, logits.size(-1)), y_out.view(-1)) + return logits, loss + +# mark test \ No newline at end of file diff --git a/SCMG/models/Transformer/model copy 3.py b/SCMG/models/Transformer/model copy 3.py new file mode 100644 index 0000000000000000000000000000000000000000..97c7c39ce3f736996efe25e2f470e6dfd4b76748 --- /dev/null +++ b/SCMG/models/Transformer/model copy 3.py @@ -0,0 +1,179 @@ +import math +import logging + +import torch +import torch.nn as nn +from torch.nn import functional as F + +logger = logging.getLogger(__name__) +from SCMG.config import varables + +class CausalSelfAttention(nn.Module): + def __init__(self, config): + super().__init__() + assert config[varables.DIM_ATTENTION] % config[varables.NUM_HEADS] == 0 + self.key = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION]) + self.query = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION]) + self.value = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION]) + self.dropout_attention = nn.Dropout(config[varables.RATE_DROPOUT]) + self.dropout_residue = nn.Dropout(config[varables.RATE_DROPOUT]) + self.projection = nn.Linear(config[varables.DIM_ATTENTION], config[varables.DIM_EMBEDDING]) + self.register_buffer("mask", torch.tril(torch.ones(config[varables.SIZE_BLOCK], config[varables.SIZE_BLOCK])) + .view(1, 1, config[varables.SIZE_BLOCK], config[varables.SIZE_BLOCK])) + self.n_head = config[varables.NUM_HEADS] + self.single_head_dim = config[varables.DIM_ATTENTION] // self.n_head + self.attention_features = 
config[varables.DIM_ATTENTION] + + def forward(self, x, layer_past=None): + B, T, C = x.size() + k = self.key(x).view(B, T, self.n_head,self.single_head_dim).transpose(1, 2) + q = self.query(x).view(B, T, self.n_head,self.single_head_dim).transpose(1, 2) + v = self.value(x).view(B, T, self.n_head,self.single_head_dim).transpose(1, 2) + att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1))) + att = att.masked_fill(self.mask[:,:,:T,:T] == 0, float('-inf')) + att = F.softmax(att, dim=-1) + att = self.dropout_attention(att) + y = att @ v + y = y.transpose(1, 2).contiguous().view(B, T, self.attention_features) + y = self.dropout_residue(self.projection(y)) + return y + + +class CrossAttention(nn.Module): + def __init__(self, config): + super().__init__() + assert config[varables.DIM_ATTENTION] % config[varables.NUM_HEADS] == 0 + self.key = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION]) + self.query = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION]) + self.value = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION]) + self.dropout_attention = nn.Dropout(config[varables.RATE_DROPOUT]) + self.dropout_residue = nn.Dropout(config[varables.RATE_DROPOUT]) + self.projection = nn.Linear(config[varables.DIM_ATTENTION], config[varables.DIM_EMBEDDING]) + self.n_head = config[varables.NUM_HEADS] + self.single_head_dim = config[varables.DIM_ATTENTION] // self.n_head + self.attention_features = config[varables.DIM_ATTENTION] + self.register_buffer("mask", torch.tril(torch.ones(config[varables.SIZE_BLOCK], config[varables.SIZE_BLOCK])) + .view(1, 1, config[varables.SIZE_BLOCK], config[varables.SIZE_BLOCK])) + + def forward(self, x_encoder,x_decoder, layer_past=None): + B_encoder, T_encoder, C_encoder = x_encoder.size() + B_decoder, T_decoder, C_decoder = x_decoder.size() + k = self.key( x_encoder).view(B_encoder, T_encoder, self.n_head,self.single_head_dim).transpose(1, 2) + q = 
self.query(x_decoder).view(B_encoder, T_decoder, self.n_head,self.single_head_dim).transpose(1, 2) + v = self.value(x_encoder).view(B_encoder, T_encoder, self.n_head,self.single_head_dim).transpose(1, 2) + att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1))) + att = att.masked_fill(self.mask[:,:,:T_decoder,:T_encoder] == 0, float('-inf')) + att = F.softmax(att, dim=-1) + att = self.dropout_attention(att) + y = att @ v + y = y.transpose(1, 2).contiguous().view(B_encoder, T_decoder, self.attention_features) + y = self.dropout_residue(self.projection(y)) + return y + + + + +class EncoderBlock(nn.Module): + def __init__(self, config): + super().__init__() + self.ln1 = nn.LayerNorm(config[varables.DIM_EMBEDDING]) + self.ln2 = nn.LayerNorm(config[varables.DIM_EMBEDDING]) + self.attn = CausalSelfAttention(config) + self.mlp = nn.Sequential( + nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_FEEDFORWARD]), + nn.GELU(), + nn.Linear(config[varables.DIM_FEEDFORWARD], config[varables.DIM_EMBEDDING]), + nn.Dropout(config[varables.RATE_DROPOUT]), + ) + + def forward(self, x): + # = y_input + x = x + self.attn(self.ln1(x)) + x = x + self.mlp(self.ln2(x)) + return x + +class DecoderBlock(nn.Module): + def __init__(self, config): + super().__init__() + self.ln1 = nn.LayerNorm(config[varables.DIM_EMBEDDING]) + self.ln2 = nn.LayerNorm(config[varables.DIM_EMBEDDING]) + self.ln3 = nn.LayerNorm(config[varables.DIM_EMBEDDING]) + self.masked_attn = CausalSelfAttention(config) + self.cross_attn = CrossAttention(config) + self.mlp = nn.Sequential( + nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_FEEDFORWARD]), + nn.GELU(), + nn.Linear(config[varables.DIM_FEEDFORWARD], config[varables.DIM_EMBEDDING]), + nn.Dropout(config[varables.RATE_DROPOUT]), + ) + + def forward(self, x_encoder,x): + # = y_input + x = x + self.masked_attn(self.ln1(x)) + x = x + self.cross_attn(x_encoder,self.ln1(x)) + x = x + self.mlp(self.ln3(x)) + return x + +class 
Model(nn.Module): + def __init__(self, config): + super().__init__() + self.tok_emb_encoder = nn.Embedding(len(config["vocab_encoder"]), config[varables.DIM_EMBEDDING]) + self.tok_emb_decoder = nn.Embedding(len(config["vocab_decoder"]), config[varables.DIM_EMBEDDING]) + self.pos_emb = nn.Parameter(torch.zeros(1, config[varables.SIZE_BLOCK], config[varables.DIM_EMBEDDING])) + self.drop = nn.Dropout(config[varables.RATE_DROPOUT]) + self.encoder_blocks = nn.ModuleList([EncoderBlock(config) for _ in range(config[varables.NUM_LAYERS])]) + self.decoder_blocks = nn.ModuleList([DecoderBlock(config) for _ in range(config[varables.NUM_LAYERS])]) + # self.blocks = nn.Sequential(*[DecoderBlock(config) for _ in range(config[varables.NUM_LAYERS])]) + self.ln_f = nn.LayerNorm(config[varables.DIM_EMBEDDING]) + self.head = nn.Linear(config[varables.DIM_EMBEDDING], len(config["vocab_decoder"]), bias=False) + self.block_size = config[varables.SIZE_BLOCK] + self.apply(self._init_weights) + logger.info("number of parameters: %e", sum(p.numel() for p in self.parameters())) + self.optimizer = None + + def get_block_size(self): + return self.block_size + + def _init_weights(self, module): + if isinstance(module, (nn.Linear, nn.Embedding)): + module.weight.data.normal_(mean=0.0, std=0.02) + if isinstance(module, nn.Linear) and module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.LayerNorm): + module.bias.data.zero_() + module.weight.data.fill_(1.0) + def init_optimizers(self,train_config): + optimizer = torch.optim.Adam(self.parameters(), lr=train_config[varables.RATE_LEARNING]) + return optimizer + def init_scheduler(self,train_config): + scheduler = torch.optim.lr_scheduler.StepLR(self.optimizer, step_size=train_config[varables.SIZE_STEP], gamma=train_config[varables.GAMMA]) + return scheduler + def get_collate_fn(self, vocab_encoder,vocab_decoder): + def collate(results): + x_in = [a[0] for a in results] + y_in = [a[1] for a in results] + boundary = -1 + 
max_len_x = max([len(a) for a in x_in]) + max_len_y = max([len(a) for a in y_in]) + x = torch.tensor([(a+[vocab_encoder[varables.TOKEN_PAD]]*(max_len_x-len(a))) for a in x_in],dtype=torch.long) + y = torch.tensor([(a+[vocab_decoder[varables.TOKEN_PAD]]*(max_len_y-len(a))) for a in y_in],dtype=torch.long) + return x,y,boundary + return collate + + def forward(self, x_in, y_in, y_out=None,boundary=None): + x_in = self.drop(self.tok_emb_encoder(x_in) + self.pos_emb[:, :x_in.size()[1], :]) + y_in = self.drop(self.tok_emb_decoder(y_in) + self.pos_emb[:, :y_in.size()[1], :]) + # + for encoder_block in self.encoder_blocks: + x_in = encoder_block(x_in) + x_in = self.ln_f(x_in) + for decoder_block in self.decoder_blocks: + y_in = decoder_block(x_in,y_in) + y_in = self.ln_f(y_in) + logits = self.head(y_in) + loss = None + if y_out is not None: + loss = F.cross_entropy(logits.view(-1, logits.size(-1)), y_out.view(-1)) + return logits, loss + +# mark test \ No newline at end of file diff --git a/SCMG/models/Transformer/model copy.py b/SCMG/models/Transformer/model copy.py new file mode 100644 index 0000000000000000000000000000000000000000..e054937f34f6e76e5e04715e6e5c76e5b486ec3f --- /dev/null +++ b/SCMG/models/Transformer/model copy.py @@ -0,0 +1,187 @@ +import math +import logging + +import torch +import torch.nn as nn +from torch.nn import functional as F + +logger = logging.getLogger(__name__) +from SCMG.config import varables + +# class ModelConfig(): +# rate_dropout_embedding = 0.1 +# rate_dropout_residue = 0.1 +# rate_dropout_attention = 0.1 +# block_size=125 +# def __init__(self, size_vocab, **kwargs): +# self.size_vocab = size_vocab +# for k,v in kwargs.items(): +# setattr(self, k, v) + +class CausalSelfAttention(nn.Module): + def __init__(self, config): + super().__init__() + assert config[varables.DIM_ATTENTION] % config[varables.NUM_HEADS] == 0 + self.key = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION]) + self.query = 
nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION]) + self.value = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION]) + self.dropout_attention = nn.Dropout(config[varables.RATE_DROPOUT]) + self.dropout_residue = nn.Dropout(config[varables.RATE_DROPOUT]) + self.projection = nn.Linear(config[varables.DIM_ATTENTION], config[varables.DIM_EMBEDDING]) + self.register_buffer("mask", torch.tril(torch.ones(config[varables.SIZE_BLOCK], config[varables.SIZE_BLOCK])) + .view(1, 1, config[varables.SIZE_BLOCK], config[varables.SIZE_BLOCK])) + self.n_head = config[varables.NUM_HEADS] + self.single_head_dim = config[varables.DIM_ATTENTION] // self.n_head + self.attention_features = config[varables.DIM_ATTENTION] + + def forward(self, x, layer_past=None): + B, T, C = x.size() + k = self.key(x).view(B, T, self.n_head,self.single_head_dim).transpose(1, 2) + q = self.query(x).view(B, T, self.n_head,self.single_head_dim).transpose(1, 2) + v = self.value(x).view(B, T, self.n_head,self.single_head_dim).transpose(1, 2) + att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1))) + att = att.masked_fill(self.mask[:,:,:T,:T] == 0, float('-inf')) + att = F.softmax(att, dim=-1) + att = self.dropout_attention(att) + y = att @ v + y = y.transpose(1, 2).contiguous().view(B, T, self.attention_features) + y = self.dropout_residue(self.projection(y)) + return y + + +class CrossAttention(nn.Module): + def __init__(self, config): + super().__init__() + assert config[varables.DIM_ATTENTION] % config[varables.NUM_HEADS] == 0 + self.key = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION]) + self.query = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION]) + self.value = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION]) + self.dropout_attention = nn.Dropout(config[varables.RATE_DROPOUT]) + self.dropout_residue = nn.Dropout(config[varables.RATE_DROPOUT]) + self.projection = 
nn.Linear(config[varables.DIM_ATTENTION], config[varables.DIM_EMBEDDING]) + self.n_head = config[varables.NUM_HEADS] + self.single_head_dim = config[varables.DIM_ATTENTION] // self.n_head + self.attention_features = config[varables.DIM_ATTENTION] + self.register_buffer("mask", torch.tril(torch.ones(config[varables.SIZE_BLOCK], config[varables.SIZE_BLOCK])) + .view(1, 1, config[varables.SIZE_BLOCK], config[varables.SIZE_BLOCK])) + + def forward(self, x_encoder,x_decoder, layer_past=None): + B_encoder, T_encoder, C_encoder = x_encoder.size() + B_decoder, T_decoder, C_decoder = x_decoder.size() + k = self.key( x_encoder).view(B_encoder, T_encoder, self.n_head,self.single_head_dim).transpose(1, 2) + q = self.query(x_decoder).view(B_encoder, T_decoder, self.n_head,self.single_head_dim).transpose(1, 2) + v = self.value(x_encoder).view(B_encoder, T_encoder, self.n_head,self.single_head_dim).transpose(1, 2) + att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1))) + att = att.masked_fill(self.mask[:,:,:T_decoder,:T_encoder] == 0, float('-inf')) + att = F.softmax(att, dim=-1) + att = self.dropout_attention(att) + y = att @ v + y = y.transpose(1, 2).contiguous().view(B_encoder, T_decoder, self.attention_features) + y = self.dropout_residue(self.projection(y)) + return y + + + + +class EncoderBlock(nn.Module): + def __init__(self, config): + super().__init__() + self.ln1 = nn.LayerNorm(config[varables.DIM_EMBEDDING]) + self.ln2 = nn.LayerNorm(config[varables.DIM_EMBEDDING]) + self.attn = CausalSelfAttention(config) + self.mlp = nn.Sequential( + nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_FEEDFORWARD]), + nn.GELU(), + nn.Linear(config[varables.DIM_FEEDFORWARD], config[varables.DIM_EMBEDDING]), + nn.Dropout(config[varables.RATE_DROPOUT]), + ) + + def forward(self, x): + # = y_input + x = x + self.attn(self.ln1(x)) + x = x + self.mlp(self.ln2(x)) + return x + +class DecoderBlock(nn.Module): + def __init__(self, config): + super().__init__() + self.ln1 = 
nn.LayerNorm(config[varables.DIM_EMBEDDING]) + self.ln2 = nn.LayerNorm(config[varables.DIM_EMBEDDING]) + self.masked_attn = CausalSelfAttention(config) + self.cross_attn = CrossAttention(config) + self.mlp = nn.Sequential( + nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_FEEDFORWARD]), + nn.GELU(), + nn.Linear(config[varables.DIM_FEEDFORWARD], config[varables.DIM_EMBEDDING]), + nn.Dropout(config[varables.RATE_DROPOUT]), + ) + + def forward(self, x_encoder,x): + # = y_input + x = x + self.masked_attn(self.ln1(x)) + x = x + self.cross_attn(x_encoder,self.ln1(x)) + x = x + self.mlp(self.ln2(x)) + return x + +class Model(nn.Module): + def __init__(self, config): + super().__init__() + self.tok_emb = nn.Embedding(len(config["vocab_encoder"]), config[varables.DIM_EMBEDDING]) + self.pos_emb = nn.Parameter(torch.zeros(1, config[varables.SIZE_BLOCK], config[varables.DIM_EMBEDDING])) + self.drop = nn.Dropout(config[varables.RATE_DROPOUT]) + self.encoder_blocks = nn.ModuleList([EncoderBlock(config) for _ in range(config[varables.NUM_LAYERS])]) + self.decoder_blocks = nn.ModuleList([DecoderBlock(config) for _ in range(config[varables.NUM_LAYERS])]) + # self.blocks = nn.Sequential(*[DecoderBlock(config) for _ in range(config[varables.NUM_LAYERS])]) + self.ln_f = nn.LayerNorm(config[varables.DIM_EMBEDDING]) + self.head = nn.Linear(config[varables.DIM_EMBEDDING], len(config["vocab_encoder"]), bias=False) + self.block_size = config[varables.SIZE_BLOCK] + self.apply(self._init_weights) + logger.info("number of parameters: %e", sum(p.numel() for p in self.parameters())) + self.optimizer = None + + def get_block_size(self): + return self.block_size + + def _init_weights(self, module): + if isinstance(module, (nn.Linear, nn.Embedding)): + module.weight.data.normal_(mean=0.0, std=0.02) + if isinstance(module, nn.Linear) and module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.LayerNorm): + module.bias.data.zero_() + 
module.weight.data.fill_(1.0) + def init_optimizers(self,train_config): + optimizer = torch.optim.Adam(self.parameters(), lr=train_config[varables.RATE_LEARNING]) + return optimizer + def init_scheduler(self,train_config): + scheduler = torch.optim.lr_scheduler.StepLR(self.optimizer, step_size=train_config[varables.SIZE_STEP], gamma=train_config[varables.GAMMA]) + return scheduler + def get_collate_fn(self, vocab): + def collate(results): + x_in = [a[0] for a in results] + y_in = [a[1] for a in results] + boundary = -1 + max_len_x = max([len(a) for a in x_in]) + max_len_y = max([len(a) for a in y_in]) + x = torch.tensor([(a+[vocab[varables.TOKEN_PAD]]*(max_len_x-len(a))) for a in x_in],dtype=torch.long) + y = torch.tensor([(a+[vocab[varables.TOKEN_PAD]]*(max_len_y-len(a))) for a in y_in],dtype=torch.long) + return x,y,boundary + return collate + + def forward(self, x_in, y_in, y_out=None,boundary=None): + x_in = self.drop(self.tok_emb(x_in) + self.pos_emb[:, :x_in.size()[1], :]) + y_in = self.drop(self.tok_emb(y_in) + self.pos_emb[:, :y_in.size()[1], :]) + # + for encoder_block in self.encoder_blocks: + x_in = encoder_block(x_in) + x_in = self.ln_f(x_in) + for decoder_block in self.decoder_blocks: + y_in = decoder_block(x_in,y_in) + y_in = self.ln_f(y_in) + logits = self.head(y_in) + loss = None + if y_out is not None: + loss = F.cross_entropy(logits.view(-1, logits.size(-1)), y_out.view(-1)) + return logits, loss + +# mark test \ No newline at end of file diff --git a/SCMG/models/Transformer/model.py b/SCMG/models/Transformer/model.py new file mode 100644 index 0000000000000000000000000000000000000000..e054937f34f6e76e5e04715e6e5c76e5b486ec3f --- /dev/null +++ b/SCMG/models/Transformer/model.py @@ -0,0 +1,187 @@ +import math +import logging + +import torch +import torch.nn as nn +from torch.nn import functional as F + +logger = logging.getLogger(__name__) +from SCMG.config import varables + +# class ModelConfig(): +# rate_dropout_embedding = 0.1 +# 
rate_dropout_residue = 0.1 +# rate_dropout_attention = 0.1 +# block_size=125 +# def __init__(self, size_vocab, **kwargs): +# self.size_vocab = size_vocab +# for k,v in kwargs.items(): +# setattr(self, k, v) + +class CausalSelfAttention(nn.Module): + def __init__(self, config): + super().__init__() + assert config[varables.DIM_ATTENTION] % config[varables.NUM_HEADS] == 0 + self.key = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION]) + self.query = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION]) + self.value = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION]) + self.dropout_attention = nn.Dropout(config[varables.RATE_DROPOUT]) + self.dropout_residue = nn.Dropout(config[varables.RATE_DROPOUT]) + self.projection = nn.Linear(config[varables.DIM_ATTENTION], config[varables.DIM_EMBEDDING]) + self.register_buffer("mask", torch.tril(torch.ones(config[varables.SIZE_BLOCK], config[varables.SIZE_BLOCK])) + .view(1, 1, config[varables.SIZE_BLOCK], config[varables.SIZE_BLOCK])) + self.n_head = config[varables.NUM_HEADS] + self.single_head_dim = config[varables.DIM_ATTENTION] // self.n_head + self.attention_features = config[varables.DIM_ATTENTION] + + def forward(self, x, layer_past=None): + B, T, C = x.size() + k = self.key(x).view(B, T, self.n_head,self.single_head_dim).transpose(1, 2) + q = self.query(x).view(B, T, self.n_head,self.single_head_dim).transpose(1, 2) + v = self.value(x).view(B, T, self.n_head,self.single_head_dim).transpose(1, 2) + att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1))) + att = att.masked_fill(self.mask[:,:,:T,:T] == 0, float('-inf')) + att = F.softmax(att, dim=-1) + att = self.dropout_attention(att) + y = att @ v + y = y.transpose(1, 2).contiguous().view(B, T, self.attention_features) + y = self.dropout_residue(self.projection(y)) + return y + + +class CrossAttention(nn.Module): + def __init__(self, config): + super().__init__() + assert 
config[varables.DIM_ATTENTION] % config[varables.NUM_HEADS] == 0 + self.key = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION]) + self.query = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION]) + self.value = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION]) + self.dropout_attention = nn.Dropout(config[varables.RATE_DROPOUT]) + self.dropout_residue = nn.Dropout(config[varables.RATE_DROPOUT]) + self.projection = nn.Linear(config[varables.DIM_ATTENTION], config[varables.DIM_EMBEDDING]) + self.n_head = config[varables.NUM_HEADS] + self.single_head_dim = config[varables.DIM_ATTENTION] // self.n_head + self.attention_features = config[varables.DIM_ATTENTION] + self.register_buffer("mask", torch.tril(torch.ones(config[varables.SIZE_BLOCK], config[varables.SIZE_BLOCK])) + .view(1, 1, config[varables.SIZE_BLOCK], config[varables.SIZE_BLOCK])) + + def forward(self, x_encoder,x_decoder, layer_past=None): + B_encoder, T_encoder, C_encoder = x_encoder.size() + B_decoder, T_decoder, C_decoder = x_decoder.size() + k = self.key( x_encoder).view(B_encoder, T_encoder, self.n_head,self.single_head_dim).transpose(1, 2) + q = self.query(x_decoder).view(B_encoder, T_decoder, self.n_head,self.single_head_dim).transpose(1, 2) + v = self.value(x_encoder).view(B_encoder, T_encoder, self.n_head,self.single_head_dim).transpose(1, 2) + att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1))) + att = att.masked_fill(self.mask[:,:,:T_decoder,:T_encoder] == 0, float('-inf')) + att = F.softmax(att, dim=-1) + att = self.dropout_attention(att) + y = att @ v + y = y.transpose(1, 2).contiguous().view(B_encoder, T_decoder, self.attention_features) + y = self.dropout_residue(self.projection(y)) + return y + + + + +class EncoderBlock(nn.Module): + def __init__(self, config): + super().__init__() + self.ln1 = nn.LayerNorm(config[varables.DIM_EMBEDDING]) + self.ln2 = nn.LayerNorm(config[varables.DIM_EMBEDDING]) + self.attn = 
CausalSelfAttention(config) + self.mlp = nn.Sequential( + nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_FEEDFORWARD]), + nn.GELU(), + nn.Linear(config[varables.DIM_FEEDFORWARD], config[varables.DIM_EMBEDDING]), + nn.Dropout(config[varables.RATE_DROPOUT]), + ) + + def forward(self, x): + # = y_input + x = x + self.attn(self.ln1(x)) + x = x + self.mlp(self.ln2(x)) + return x + +class DecoderBlock(nn.Module): + def __init__(self, config): + super().__init__() + self.ln1 = nn.LayerNorm(config[varables.DIM_EMBEDDING]) + self.ln2 = nn.LayerNorm(config[varables.DIM_EMBEDDING]) + self.masked_attn = CausalSelfAttention(config) + self.cross_attn = CrossAttention(config) + self.mlp = nn.Sequential( + nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_FEEDFORWARD]), + nn.GELU(), + nn.Linear(config[varables.DIM_FEEDFORWARD], config[varables.DIM_EMBEDDING]), + nn.Dropout(config[varables.RATE_DROPOUT]), + ) + + def forward(self, x_encoder,x): + # = y_input + x = x + self.masked_attn(self.ln1(x)) + x = x + self.cross_attn(x_encoder,self.ln1(x)) + x = x + self.mlp(self.ln2(x)) + return x + +class Model(nn.Module): + def __init__(self, config): + super().__init__() + self.tok_emb = nn.Embedding(len(config["vocab_encoder"]), config[varables.DIM_EMBEDDING]) + self.pos_emb = nn.Parameter(torch.zeros(1, config[varables.SIZE_BLOCK], config[varables.DIM_EMBEDDING])) + self.drop = nn.Dropout(config[varables.RATE_DROPOUT]) + self.encoder_blocks = nn.ModuleList([EncoderBlock(config) for _ in range(config[varables.NUM_LAYERS])]) + self.decoder_blocks = nn.ModuleList([DecoderBlock(config) for _ in range(config[varables.NUM_LAYERS])]) + # self.blocks = nn.Sequential(*[DecoderBlock(config) for _ in range(config[varables.NUM_LAYERS])]) + self.ln_f = nn.LayerNorm(config[varables.DIM_EMBEDDING]) + self.head = nn.Linear(config[varables.DIM_EMBEDDING], len(config["vocab_encoder"]), bias=False) + self.block_size = config[varables.SIZE_BLOCK] + self.apply(self._init_weights) 
+ logger.info("number of parameters: %e", sum(p.numel() for p in self.parameters())) + self.optimizer = None + + def get_block_size(self): + return self.block_size + + def _init_weights(self, module): + if isinstance(module, (nn.Linear, nn.Embedding)): + module.weight.data.normal_(mean=0.0, std=0.02) + if isinstance(module, nn.Linear) and module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.LayerNorm): + module.bias.data.zero_() + module.weight.data.fill_(1.0) + def init_optimizers(self,train_config): + optimizer = torch.optim.Adam(self.parameters(), lr=train_config[varables.RATE_LEARNING]) + return optimizer + def init_scheduler(self,train_config): + scheduler = torch.optim.lr_scheduler.StepLR(self.optimizer, step_size=train_config[varables.SIZE_STEP], gamma=train_config[varables.GAMMA]) + return scheduler + def get_collate_fn(self, vocab): + def collate(results): + x_in = [a[0] for a in results] + y_in = [a[1] for a in results] + boundary = -1 + max_len_x = max([len(a) for a in x_in]) + max_len_y = max([len(a) for a in y_in]) + x = torch.tensor([(a+[vocab[varables.TOKEN_PAD]]*(max_len_x-len(a))) for a in x_in],dtype=torch.long) + y = torch.tensor([(a+[vocab[varables.TOKEN_PAD]]*(max_len_y-len(a))) for a in y_in],dtype=torch.long) + return x,y,boundary + return collate + + def forward(self, x_in, y_in, y_out=None,boundary=None): + x_in = self.drop(self.tok_emb(x_in) + self.pos_emb[:, :x_in.size()[1], :]) + y_in = self.drop(self.tok_emb(y_in) + self.pos_emb[:, :y_in.size()[1], :]) + # + for encoder_block in self.encoder_blocks: + x_in = encoder_block(x_in) + x_in = self.ln_f(x_in) + for decoder_block in self.decoder_blocks: + y_in = decoder_block(x_in,y_in) + y_in = self.ln_f(y_in) + logits = self.head(y_in) + loss = None + if y_out is not None: + loss = F.cross_entropy(logits.view(-1, logits.size(-1)), y_out.view(-1)) + return logits, loss + +# mark test \ No newline at end of file diff --git a/SCMG/models/Transformer/sampler.py 
b/SCMG/models/Transformer/sampler.py new file mode 100644 index 0000000000000000000000000000000000000000..1c606302447534d425b50bbd15c153ea79895b65 --- /dev/null +++ b/SCMG/models/Transformer/sampler.py @@ -0,0 +1,85 @@ +import random +import numpy as np +import torch +import torch.nn as nn +from torch.nn import functional as F + +def set_seed(seed): + random.seed(seed) + np.random.seed(seed) + torch.manual_seed(seed) + torch.cuda.manual_seed_all(seed) + +def top_k_logits(logits, k): + v, ix = torch.topk(logits, k) + out = logits.clone() + out[out < v[:, [-1]]] = -float('Inf') + return out + +@torch.no_grad() +def sample(model, x, steps, temperature=1.0, sample=False, top_k=None): + block_size = model.get_block_size() + model.eval() + for k in range(steps): + x_cond = x if x.size(1) <= block_size else x[:, -block_size:] + logits, _ = model(x_cond) + logits = logits[:, -1, :] / temperature + if top_k is not None: + logits = top_k_logits(logits, top_k) + probs = F.softmax(logits, dim=-1) + if sample: + ix = torch.multinomial(probs, num_samples=1) + else: + _, ix = torch.topk(probs, k=1, dim=-1) + x = torch.cat((x, ix), dim=1) + + return x + + + + +@torch.no_grad() +def sample(model, x, steps, temperature=1.0,boundary=None): + block_size = model.get_block_size() + model.eval() + for k in range(steps): + x_cond = x if x.size(1) <= block_size else x[:, -block_size:] + logits, _ = model(x_cond,boundary=boundary) + logits = logits[:, -1, :] / temperature + probs = F.softmax(logits, dim=-1) + ix = torch.multinomial(probs, num_samples=1) + x = torch.cat((x, ix), dim=1) + return x + +'L_5*C(=O)NCc1cccc(OC)c1.*c1nsc2ccccc12COc1cccc(CNC(=O)c2cccc(NC(=O)c3nsc4ccccc34)c2)c1' + +# for i in range(1,21): +def sample_L(i,option='string'): + # i=2 + prefix = 'L_'+str(i) + string_input = prefix + '*O=C1NN=Cc2c1cccc2.*O=C(C1CC1)N1CCNCC1' + array_input = [vocab[a] for a in [''] + list(string_input)] + boundary = [len(array_input)] + tensor_input = 
torch.tensor(array_input,device='cuda').unsqueeze(0).repeat(32,1) + boundary = boundary*32 + tensor_output = sample(model,tensor_input,250,boundary=boundary) + strings_output = [] + for j in range(tensor_output.shape[0]): + list_string_output = [inv[a] for a in tensor_output[j,boundary[j]:].cpu().numpy() if a != vocab['']] + # if list_string_output[0] == '': + # list_string_output = list_string_output[1:] + if list_string_output[-1] == '': + list_string_output = list_string_output[:-1] + string_output = ''.join(list_string_output) + strings_output.append(string_output) + print(string_output) + for j in range(tensor_output.shape[0]): + if test_valid(strings_output[j]): + print(1) + else: + print(0) + + # logits,_ = model(tensor_input,boundary=boundary) + + +['', 'L', '_', '5', '*', 'C', '(', '=', 'O', ')', 'N', 'C', 'c', '1', 'c', 'c', 'c', 'c', '(', 'O', 'C', ')', 'c', '1', '.', '*', 'c', '1', 'n', 's', 'c', '2', 'c', 'c', 'c', 'c', 'c', '1', '2', 'C', 'O', 'c', '1', 'c', 'c', 'c', 'c', '(', 'C', 'N', 'C', '(', '=', 'O', ')', 'c', '2', 'c', 'c', 'c', 'c', '(', 'N', 'C', '(', '=', 'O', ')', 'c', '3', 'n', 's', 'c', '4', 'c', 'c', 'c', 'c', 'c', '3', '4', ')', 'c', '2', ')', 'c', '1', ''] diff --git a/SCMG/models/Transformer_Test/__init__.py b/SCMG/models/Transformer_Test/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/SCMG/models/Transformer_Test/__pycache__/__init__.cpython-310.pyc b/SCMG/models/Transformer_Test/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b8db1dde5092caca6bf576f805bae30e4aada64f Binary files /dev/null and b/SCMG/models/Transformer_Test/__pycache__/__init__.cpython-310.pyc differ diff --git a/SCMG/models/Transformer_Test/__pycache__/model.cpython-310.pyc b/SCMG/models/Transformer_Test/__pycache__/model.cpython-310.pyc new file mode 100644 index 
0000000000000000000000000000000000000000..e70ddb28cec728f2fb9abfd23c38420855f31920 Binary files /dev/null and b/SCMG/models/Transformer_Test/__pycache__/model.cpython-310.pyc differ diff --git a/SCMG/models/Transformer_Test/__pycache__/sampler.cpython-310.pyc b/SCMG/models/Transformer_Test/__pycache__/sampler.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..10af8d4ec5cd0aaac36d5fb759eb33c5c5de1f45 Binary files /dev/null and b/SCMG/models/Transformer_Test/__pycache__/sampler.cpython-310.pyc differ diff --git a/SCMG/models/Transformer_Test/model.py b/SCMG/models/Transformer_Test/model.py new file mode 100644 index 0000000000000000000000000000000000000000..c38354e9b334af027d3507ea8c11f48a76fcd207 --- /dev/null +++ b/SCMG/models/Transformer_Test/model.py @@ -0,0 +1,187 @@ +import math +import logging + +import torch +import torch.nn as nn +from torch.nn import functional as F + +logger = logging.getLogger(__name__) +from SCMG.config import varables + +# class ModelConfig(): +# rate_dropout_embedding = 0.1 +# rate_dropout_residue = 0.1 +# rate_dropout_attention = 0.1 +# block_size=125 +# def __init__(self, size_vocab, **kwargs): +# self.size_vocab = size_vocab +# for k,v in kwargs.items(): +# setattr(self, k, v) + +class CausalSelfAttention(nn.Module): + def __init__(self, config): + super().__init__() + assert config[varables.DIM_ATTENTION] % config[varables.NUM_HEADS] == 0 + self.key = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION]) + self.query = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION]) + self.value = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION]) + self.dropout_attention = nn.Dropout(config[varables.RATE_DROPOUT]) + self.dropout_residue = nn.Dropout(config[varables.RATE_DROPOUT]) + self.projection = nn.Linear(config[varables.DIM_ATTENTION], config[varables.DIM_EMBEDDING]) + self.register_buffer("mask", 
torch.tril(torch.ones(config[varables.SIZE_BLOCK], config[varables.SIZE_BLOCK])) + .view(1, 1, config[varables.SIZE_BLOCK], config[varables.SIZE_BLOCK])) + self.n_head = config[varables.NUM_HEADS] + self.single_head_dim = config[varables.DIM_ATTENTION] // self.n_head + self.attention_features = config[varables.DIM_ATTENTION] + + def forward(self, x, layer_past=None): + B, T, C = x.size() + k = self.key(x).view(B, T, self.n_head,self.single_head_dim).transpose(1, 2) + q = self.query(x).view(B, T, self.n_head,self.single_head_dim).transpose(1, 2) + v = self.value(x).view(B, T, self.n_head,self.single_head_dim).transpose(1, 2) + att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1))) + att = att.masked_fill(self.mask[:,:,:T,:T] == 0, float('-inf')) + att = F.softmax(att, dim=-1) + att = self.dropout_attention(att) + y = att @ v + y = y.transpose(1, 2).contiguous().view(B, T, self.attention_features) + y = self.dropout_residue(self.projection(y)) + return y + + +class CrossAttention(nn.Module): + def __init__(self, config): + super().__init__() + assert config[varables.DIM_ATTENTION] % config[varables.NUM_HEADS] == 0 + self.key = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION]) + self.query = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION]) + self.value = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION]) + self.dropout_attention = nn.Dropout(config[varables.RATE_DROPOUT]) + self.dropout_residue = nn.Dropout(config[varables.RATE_DROPOUT]) + self.projection = nn.Linear(config[varables.DIM_ATTENTION], config[varables.DIM_EMBEDDING]) + self.n_head = config[varables.NUM_HEADS] + self.single_head_dim = config[varables.DIM_ATTENTION] // self.n_head + self.attention_features = config[varables.DIM_ATTENTION] + self.register_buffer("mask", torch.tril(torch.ones(config[varables.SIZE_BLOCK], config[varables.SIZE_BLOCK])) + .view(1, 1, config[varables.SIZE_BLOCK], config[varables.SIZE_BLOCK])) + + def 
forward(self, x_encoder,x_decoder, layer_past=None): + B_encoder, T_encoder, C_encoder = x_encoder.size() + B_decoder, T_decoder, C_decoder = x_decoder.size() + k = self.key(torch.cat([x_encoder,x_decoder],dim=1)).view(B_encoder, (T_encoder+T_decoder), self.n_head,self.single_head_dim).transpose(1, 2) + q = self.query(x_decoder).view(B_encoder, T_decoder, self.n_head,self.single_head_dim).transpose(1, 2) + v = self.value(torch.cat([x_encoder,x_decoder],dim=1)).view(B_decoder, (T_encoder+T_decoder), self.n_head,self.single_head_dim).transpose(1, 2) + att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1))) + att = att.masked_fill(self.mask[:,:,:T_decoder,:(T_encoder+T_decoder)] == 0, float('-inf')) + att = F.softmax(att, dim=-1) + att = self.dropout_attention(att) + y = att @ v + y = y.transpose(1, 2).contiguous().view(B_encoder, T_decoder, self.attention_features) + y = self.dropout_residue(self.projection(y)) + return y + + + + +class EncoderBlock(nn.Module): + def __init__(self, config): + super().__init__() + self.ln1 = nn.LayerNorm(config[varables.DIM_EMBEDDING]) + self.ln2 = nn.LayerNorm(config[varables.DIM_EMBEDDING]) + self.attn = CausalSelfAttention(config) + self.mlp = nn.Sequential( + nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_FEEDFORWARD]), + nn.GELU(), + nn.Linear(config[varables.DIM_FEEDFORWARD], config[varables.DIM_EMBEDDING]), + nn.Dropout(config[varables.RATE_DROPOUT]), + ) + + def forward(self, x): + # = y_input + x = x + self.attn(self.ln1(x)) + x = x + self.mlp(self.ln2(x)) + return x + +class DecoderBlock(nn.Module): + def __init__(self, config): + super().__init__() + self.ln1 = nn.LayerNorm(config[varables.DIM_EMBEDDING]) + self.ln2 = nn.LayerNorm(config[varables.DIM_EMBEDDING]) + self.masked_attn = CausalSelfAttention(config) + self.cross_attn = CrossAttention(config) + self.mlp = nn.Sequential( + nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_FEEDFORWARD]), + nn.GELU(), + 
nn.Linear(config[varables.DIM_FEEDFORWARD], config[varables.DIM_EMBEDDING]), + nn.Dropout(config[varables.RATE_DROPOUT]), + ) + + def forward(self, x_encoder,x): + # = y_input + x = x + self.masked_attn(self.ln1(x)) + x = x + self.cross_attn(x_encoder,self.ln1(x)) + x = x + self.mlp(self.ln2(x)) + return x + +class Model(nn.Module): + def __init__(self, config): + super().__init__() + self.tok_emb = nn.Embedding(config[varables.SIZE_VOCAB], config[varables.DIM_EMBEDDING]) + self.pos_emb = nn.Parameter(torch.zeros(1, config[varables.SIZE_BLOCK], config[varables.DIM_EMBEDDING])) + self.drop = nn.Dropout(config[varables.RATE_DROPOUT]) + self.encoder_blocks = nn.ModuleList([EncoderBlock(config) for _ in range(config[varables.NUM_LAYERS])]) + self.decoder_blocks = nn.ModuleList([DecoderBlock(config) for _ in range(config[varables.NUM_LAYERS])]) + # self.blocks = nn.Sequential(*[DecoderBlock(config) for _ in range(config[varables.NUM_LAYERS])]) + self.ln_f = nn.LayerNorm(config[varables.DIM_EMBEDDING]) + self.head = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.SIZE_VOCAB], bias=False) + self.block_size = config[varables.SIZE_BLOCK] + self.apply(self._init_weights) + logger.info("number of parameters: %e", sum(p.numel() for p in self.parameters())) + self.optimizer = None + + def get_block_size(self): + return self.block_size + + def _init_weights(self, module): + if isinstance(module, (nn.Linear, nn.Embedding)): + module.weight.data.normal_(mean=0.0, std=0.02) + if isinstance(module, nn.Linear) and module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.LayerNorm): + module.bias.data.zero_() + module.weight.data.fill_(1.0) + def init_optimizers(self,train_config): + optimizer = torch.optim.Adam(self.parameters(), lr=train_config[varables.RATE_LEARNING]) + return optimizer + def init_scheduler(self,train_config): + scheduler = torch.optim.lr_scheduler.StepLR(self.optimizer, step_size=train_config[varables.SIZE_STEP], 
gamma=train_config[varables.GAMMA]) + return scheduler + def get_collate_fn(self, vocab): + def collate(results): + x_in = [a[0] for a in results] + y_in = [a[1] for a in results] + boundary = -1 + max_len_x = max([len(a) for a in x_in]) + max_len_y = max([len(a) for a in y_in]) + x = torch.tensor([(a+[vocab[varables.TOKEN_PAD]]*(max_len_x-len(a))) for a in x_in],dtype=torch.long) + y = torch.tensor([(a+[vocab[varables.TOKEN_PAD]]*(max_len_y-len(a))) for a in y_in],dtype=torch.long) + return x,y,boundary + return collate + + def forward(self, x_in, y_in, y_out=None,boundary=None): + x_in = self.drop(self.tok_emb(x_in) + self.pos_emb[:, :x_in.size()[1], :]) + y_in = self.drop(self.tok_emb(y_in) + self.pos_emb[:, :y_in.size()[1], :]) + # + for encoder_block in self.encoder_blocks: + x_in = encoder_block(x_in) + x_in = self.ln_f(x_in) + for decoder_block in self.decoder_blocks: + y_in = decoder_block(x_in,y_in) + y_in = self.ln_f(y_in) + logits = self.head(y_in) + loss = None + if y_out is not None: + loss = F.cross_entropy(logits.view(-1, logits.size(-1)), y_out.view(-1)) + return logits, loss + +# mark test \ No newline at end of file diff --git a/SCMG/models/Transformer_Test/sampler.py b/SCMG/models/Transformer_Test/sampler.py new file mode 100644 index 0000000000000000000000000000000000000000..1c606302447534d425b50bbd15c153ea79895b65 --- /dev/null +++ b/SCMG/models/Transformer_Test/sampler.py @@ -0,0 +1,85 @@ +import random +import numpy as np +import torch +import torch.nn as nn +from torch.nn import functional as F + +def set_seed(seed): + random.seed(seed) + np.random.seed(seed) + torch.manual_seed(seed) + torch.cuda.manual_seed_all(seed) + +def top_k_logits(logits, k): + v, ix = torch.topk(logits, k) + out = logits.clone() + out[out < v[:, [-1]]] = -float('Inf') + return out + +@torch.no_grad() +def sample(model, x, steps, temperature=1.0, sample=False, top_k=None): + block_size = model.get_block_size() + model.eval() + for k in range(steps): + x_cond = x if 
x.size(1) <= block_size else x[:, -block_size:] + logits, _ = model(x_cond) + logits = logits[:, -1, :] / temperature + if top_k is not None: + logits = top_k_logits(logits, top_k) + probs = F.softmax(logits, dim=-1) + if sample: + ix = torch.multinomial(probs, num_samples=1) + else: + _, ix = torch.topk(probs, k=1, dim=-1) + x = torch.cat((x, ix), dim=1) + + return x + + + + +@torch.no_grad() +def sample(model, x, steps, temperature=1.0,boundary=None): + block_size = model.get_block_size() + model.eval() + for k in range(steps): + x_cond = x if x.size(1) <= block_size else x[:, -block_size:] + logits, _ = model(x_cond,boundary=boundary) + logits = logits[:, -1, :] / temperature + probs = F.softmax(logits, dim=-1) + ix = torch.multinomial(probs, num_samples=1) + x = torch.cat((x, ix), dim=1) + return x + +'L_5*C(=O)NCc1cccc(OC)c1.*c1nsc2ccccc12COc1cccc(CNC(=O)c2cccc(NC(=O)c3nsc4ccccc34)c2)c1' + +# for i in range(1,21): +def sample_L(i,option='string'): + # i=2 + prefix = 'L_'+str(i) + string_input = prefix + '*O=C1NN=Cc2c1cccc2.*O=C(C1CC1)N1CCNCC1' + array_input = [vocab[a] for a in [''] + list(string_input)] + boundary = [len(array_input)] + tensor_input = torch.tensor(array_input,device='cuda').unsqueeze(0).repeat(32,1) + boundary = boundary*32 + tensor_output = sample(model,tensor_input,250,boundary=boundary) + strings_output = [] + for j in range(tensor_output.shape[0]): + list_string_output = [inv[a] for a in tensor_output[j,boundary[j]:].cpu().numpy() if a != vocab['']] + # if list_string_output[0] == '': + # list_string_output = list_string_output[1:] + if list_string_output[-1] == '': + list_string_output = list_string_output[:-1] + string_output = ''.join(list_string_output) + strings_output.append(string_output) + print(string_output) + for j in range(tensor_output.shape[0]): + if test_valid(strings_output[j]): + print(1) + else: + print(0) + + # logits,_ = model(tensor_input,boundary=boundary) + + +['', 'L', '_', '5', '*', 'C', '(', '=', 'O', ')', 'N', 
'C', 'c', '1', 'c', 'c', 'c', 'c', '(', 'O', 'C', ')', 'c', '1', '.', '*', 'c', '1', 'n', 's', 'c', '2', 'c', 'c', 'c', 'c', 'c', '1', '2', 'C', 'O', 'c', '1', 'c', 'c', 'c', 'c', '(', 'C', 'N', 'C', '(', '=', 'O', ')', 'c', '2', 'c', 'c', 'c', 'c', '(', 'N', 'C', '(', '=', 'O', ')', 'c', '3', 'n', 's', 'c', '4', 'c', 'c', 'c', 'c', 'c', '3', '4', ')', 'c', '2', ')', 'c', '1', ''] diff --git a/SCMG/models/Transformer_Torch/__init__.py b/SCMG/models/Transformer_Torch/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/SCMG/models/Transformer_Torch/__pycache__/__init__.cpython-310.pyc b/SCMG/models/Transformer_Torch/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b22844cf91a2d3edccd43a41f6314018ea8c6dfe Binary files /dev/null and b/SCMG/models/Transformer_Torch/__pycache__/__init__.cpython-310.pyc differ diff --git a/SCMG/models/Transformer_Torch/__pycache__/model.cpython-310.pyc b/SCMG/models/Transformer_Torch/__pycache__/model.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2611e38b5ac471c7eb0304b2ba3d82087c838e53 Binary files /dev/null and b/SCMG/models/Transformer_Torch/__pycache__/model.cpython-310.pyc differ diff --git a/SCMG/models/Transformer_Torch/__pycache__/sampler.cpython-310.pyc b/SCMG/models/Transformer_Torch/__pycache__/sampler.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f0490881eacb0a6ff919790ecbda9c0439dbdd85 Binary files /dev/null and b/SCMG/models/Transformer_Torch/__pycache__/sampler.cpython-310.pyc differ diff --git a/SCMG/models/Transformer_Torch/model.py b/SCMG/models/Transformer_Torch/model.py new file mode 100644 index 0000000000000000000000000000000000000000..5141298f1d2c558f1c37def377daaea42c2d551e --- /dev/null +++ b/SCMG/models/Transformer_Torch/model.py @@ -0,0 +1,895 @@ +import math +import logging + +import torch +import 
torch.nn as nn +from torch.nn import functional as F +# from torch.nn import TransformerEncoder, TransformerEncoderLayer +# from torch.nn import TransformerDecoder, TransformerDecoderLayer +# from torch.nn import Transformer +logger = logging.getLogger(__name__) +from SCMG.config import varables +from torch.nn.init import constant_, xavier_normal_, xavier_uniform_ + +import numbers +import copy +from typing import Optional, Any, Union, Callable,Tuple,List + +import torch +from torch import Tensor +from torch.nn import functional as F +from torch.nn import Module +from torch.nn.modules.activation import MultiheadAttention +from torch.nn.modules.container import ModuleList +from torch.nn.modules.dropout import Dropout +from torch.nn.modules.linear import Linear +from torch.nn.modules.normalization import LayerNorm +from torch.nn.parameter import Parameter +from torch import Tensor, Size +from torch.nn import init + + +_shape_t = Union[int, List[int], Size] +class LayerNorm(Module): + r"""Applies Layer Normalization over a mini-batch of inputs as described in + the paper `Layer Normalization `__ + .. math:: + y = \frac{x - \mathrm{E}[x]}{ \sqrt{\mathrm{Var}[x] + \epsilon}} * \gamma + \beta + The mean and standard-deviation are calculated over the last `D` dimensions, where `D` + is the dimension of :attr:`normalized_shape`. For example, if :attr:`normalized_shape` + is ``(3, 5)`` (a 2-dimensional shape), the mean and standard-deviation are computed over + the last 2 dimensions of the input (i.e. ``input.mean((-2, -1))``). + :math:`\gamma` and :math:`\beta` are learnable affine transform parameters of + :attr:`normalized_shape` if :attr:`elementwise_affine` is ``True``. + The standard-deviation is calculated via the biased estimator, equivalent to + `torch.var(input, unbiased=False)`. + .. 
note:: + Unlike Batch Normalization and Instance Normalization, which applies + scalar scale and bias for each entire channel/plane with the + :attr:`affine` option, Layer Normalization applies per-element scale and + bias with :attr:`elementwise_affine`. + This layer uses statistics computed from input data in both training and + evaluation modes. + Args: + normalized_shape (int or list or torch.Size): input shape from an expected input + of size + .. math:: + [* \times \text{normalized\_shape}[0] \times \text{normalized\_shape}[1] + \times \ldots \times \text{normalized\_shape}[-1]] + If a single integer is used, it is treated as a singleton list, and this module will + normalize over the last dimension which is expected to be of that specific size. + eps: a value added to the denominator for numerical stability. Default: 1e-5 + elementwise_affine: a boolean value that when set to ``True``, this module + has learnable per-element affine parameters initialized to ones (for weights) + and zeros (for biases). Default: ``True``. + Attributes: + weight: the learnable weights of the module of shape + :math:`\text{normalized\_shape}` when :attr:`elementwise_affine` is set to ``True``. + The values are initialized to 1. + bias: the learnable bias of the module of shape + :math:`\text{normalized\_shape}` when :attr:`elementwise_affine` is set to ``True``. + The values are initialized to 0. + Shape: + - Input: :math:`(N, *)` + - Output: :math:`(N, *)` (same shape as input) + Examples:: + >>> # NLP Example + >>> batch, sentence_length, embedding_dim = 20, 5, 10 + >>> embedding = torch.randn(batch, sentence_length, embedding_dim) + >>> layer_norm = nn.LayerNorm(embedding_dim) + >>> # Activate module + >>> layer_norm(embedding) + >>> + >>> # Image Example + >>> N, C, H, W = 20, 5, 10, 10 + >>> input = torch.randn(N, C, H, W) + >>> # Normalize over the last three dimensions (i.e. 
the channel and spatial dimensions) + >>> # as shown in the image below + >>> layer_norm = nn.LayerNorm([C, H, W]) + >>> output = layer_norm(input) + .. image:: ../_static/img/nn/layer_norm.jpg + :scale: 50 % + """ + __constants__ = ['normalized_shape', 'eps', 'elementwise_affine'] + normalized_shape: Tuple[int, ...] + eps: float + elementwise_affine: bool + + def __init__(self, normalized_shape: _shape_t, eps: float = 1e-5, elementwise_affine: bool = True, + device=None, dtype=None) -> None: + factory_kwargs = {'device': device, 'dtype': dtype} + super(LayerNorm, self).__init__() + if isinstance(normalized_shape, numbers.Integral): + # mypy error: incompatible types in assignment + normalized_shape = (normalized_shape,) # type: ignore[assignment] + self.normalized_shape = tuple(normalized_shape) # type: ignore[arg-type] + self.eps = eps + self.elementwise_affine = elementwise_affine + if self.elementwise_affine: + self.weight = Parameter(torch.empty(self.normalized_shape, **factory_kwargs)) + self.bias = Parameter(torch.empty(self.normalized_shape, **factory_kwargs)) + else: + self.register_parameter('weight', None) + self.register_parameter('bias', None) + + self.reset_parameters() + + def reset_parameters(self) -> None: + if self.elementwise_affine: + init.ones_(self.weight) + init.zeros_(self.bias) + + def forward(self, input: Tensor) -> Tensor: + return F.layer_norm( + input, self.normalized_shape, self.weight, self.bias, self.eps) + + def extra_repr(self) -> str: + return '{normalized_shape}, eps={eps}, ' \ + 'elementwise_affine={elementwise_affine}'.format(**self.__dict__) + + +class Linear(Module): + r"""Applies a linear transformation to the incoming data: :math:`y = xA^T + b` + This module supports :ref:`TensorFloat32`. + Args: + in_features: size of each input sample + out_features: size of each output sample + bias: If set to ``False``, the layer will not learn an additive bias. 
+ Default: ``True`` + Shape: + - Input: :math:`(*, H_{in})` where :math:`*` means any number of + dimensions including none and :math:`H_{in} = \text{in\_features}`. + - Output: :math:`(*, H_{out})` where all but the last dimension + are the same shape as the input and :math:`H_{out} = \text{out\_features}`. + Attributes: + weight: the learnable weights of the module of shape + :math:`(\text{out\_features}, \text{in\_features})`. The values are + initialized from :math:`\mathcal{U}(-\sqrt{k}, \sqrt{k})`, where + :math:`k = \frac{1}{\text{in\_features}}` + bias: the learnable bias of the module of shape :math:`(\text{out\_features})`. + If :attr:`bias` is ``True``, the values are initialized from + :math:`\mathcal{U}(-\sqrt{k}, \sqrt{k})` where + :math:`k = \frac{1}{\text{in\_features}}` + Examples:: + >>> m = nn.Linear(20, 30) + >>> input = torch.randn(128, 20) + >>> output = m(input) + >>> print(output.size()) + torch.Size([128, 30]) + """ + __constants__ = ['in_features', 'out_features'] + in_features: int + out_features: int + weight: Tensor + + def __init__(self, in_features: int, out_features: int, bias: bool = True, + device=None, dtype=None) -> None: + factory_kwargs = {'device': device, 'dtype': dtype} + super(Linear, self).__init__() + self.in_features = in_features + self.out_features = out_features + self.weight = Parameter(torch.empty((out_features, in_features), **factory_kwargs)) + if bias: + self.bias = Parameter(torch.empty(out_features, **factory_kwargs)) + else: + self.register_parameter('bias', None) + self.reset_parameters() + + def reset_parameters(self) -> None: + # Setting a=sqrt(5) in kaiming_uniform is the same as initializing with + # uniform(-1/sqrt(in_features), 1/sqrt(in_features)). 
For details, see + # https://github.com/pytorch/pytorch/issues/57109 + init.kaiming_uniform_(self.weight, a=math.sqrt(5)) + if self.bias is not None: + fan_in, _ = init._calculate_fan_in_and_fan_out(self.weight) + bound = 1 / math.sqrt(fan_in) if fan_in > 0 else 0 + init.uniform_(self.bias, -bound, bound) + + def forward(self, input: Tensor) -> Tensor: + return F.linear(input, self.weight, self.bias) + + def extra_repr(self) -> str: + return 'in_features={}, out_features={}, bias={}'.format( + self.in_features, self.out_features, self.bias is not None + ) + +# This class exists solely to avoid triggering an obscure error when scripting +# an improperly quantized attention layer. See this issue for details: +# https://github.com/pytorch/pytorch/issues/58969 +# TODO: fail fast on quantization API usage error, then remove this class +# and replace uses of it with plain Linear +class NonDynamicallyQuantizableLinear(Linear): + def __init__(self, in_features: int, out_features: int, bias: bool = True, + device=None, dtype=None) -> None: + super().__init__(in_features, out_features, bias=bias, + device=device, dtype=dtype) + + +class MultiheadAttention(Module): + r"""Allows the model to jointly attend to information + from different representation subspaces. + See `Attention Is All You Need `_. + .. math:: + \text{MultiHead}(Q, K, V) = \text{Concat}(head_1,\dots,head_h)W^O + where :math:`head_i = \text{Attention}(QW_i^Q, KW_i^K, VW_i^V)`. + Args: + embed_dim: Total dimension of the model. + num_heads: Number of parallel attention heads. Note that ``embed_dim`` will be split + across ``num_heads`` (i.e. each head will have dimension ``embed_dim // num_heads``). + dropout: Dropout probability on ``attn_output_weights``. Default: ``0.0`` (no dropout). + bias: If specified, adds bias to input / output projection layers. Default: ``True``. + add_bias_kv: If specified, adds bias to the key and value sequences at dim=0. Default: ``False``. 
+ add_zero_attn: If specified, adds a new batch of zeros to the key and value sequences at dim=1. + Default: ``False``. + kdim: Total number of features for keys. Default: ``None`` (uses ``kdim=embed_dim``). + vdim: Total number of features for values. Default: ``None`` (uses ``vdim=embed_dim``). + batch_first: If ``True``, then the input and output tensors are provided + as (batch, seq, feature). Default: ``False`` (seq, batch, feature). + Examples:: + >>> multihead_attn = nn.MultiheadAttention(embed_dim, num_heads) + >>> attn_output, attn_output_weights = multihead_attn(query, key, value) + """ + __constants__ = ['batch_first'] + bias_k: Optional[torch.Tensor] + bias_v: Optional[torch.Tensor] + + def __init__(self, embed_dim, num_heads, dropout=0., bias=True, add_bias_kv=False, add_zero_attn=False, + kdim=None, vdim=None, batch_first=False, device=None, dtype=None) -> None: + factory_kwargs = {'device': device, 'dtype': dtype} + super(MultiheadAttention, self).__init__() + self.embed_dim = embed_dim + self.kdim = kdim if kdim is not None else embed_dim + self.vdim = vdim if vdim is not None else embed_dim + self._qkv_same_embed_dim = self.kdim == embed_dim and self.vdim == embed_dim + + self.num_heads = num_heads + self.dropout = dropout + self.batch_first = batch_first + self.head_dim = embed_dim // num_heads + assert self.head_dim * num_heads == self.embed_dim, "embed_dim must be divisible by num_heads" + + if self._qkv_same_embed_dim is False: + self.q_proj_weight = Parameter(torch.empty((embed_dim, embed_dim), **factory_kwargs)) + self.k_proj_weight = Parameter(torch.empty((embed_dim, self.kdim), **factory_kwargs)) + self.v_proj_weight = Parameter(torch.empty((embed_dim, self.vdim), **factory_kwargs)) + self.register_parameter('in_proj_weight', None) + else: + self.in_proj_weight = Parameter(torch.empty((3 * embed_dim, embed_dim), **factory_kwargs)) + self.register_parameter('q_proj_weight', None) + self.register_parameter('k_proj_weight', None) + 
self.register_parameter('v_proj_weight', None) + + if bias: + self.in_proj_bias = Parameter(torch.empty(3 * embed_dim, **factory_kwargs)) + else: + self.register_parameter('in_proj_bias', None) + self.out_proj = NonDynamicallyQuantizableLinear(embed_dim, embed_dim, bias=bias, **factory_kwargs) + + if add_bias_kv: + self.bias_k = Parameter(torch.empty((1, 1, embed_dim), **factory_kwargs)) + self.bias_v = Parameter(torch.empty((1, 1, embed_dim), **factory_kwargs)) + else: + self.bias_k = self.bias_v = None + + self.add_zero_attn = add_zero_attn + + self._reset_parameters() + + def _reset_parameters(self): + if self._qkv_same_embed_dim: + xavier_uniform_(self.in_proj_weight) + else: + xavier_uniform_(self.q_proj_weight) + xavier_uniform_(self.k_proj_weight) + xavier_uniform_(self.v_proj_weight) + + if self.in_proj_bias is not None: + constant_(self.in_proj_bias, 0.) + constant_(self.out_proj.bias, 0.) + if self.bias_k is not None: + xavier_normal_(self.bias_k) + if self.bias_v is not None: + xavier_normal_(self.bias_v) + + def __setstate__(self, state): + # Support loading old MultiheadAttention checkpoints generated by v1.1.0 + if '_qkv_same_embed_dim' not in state: + state['_qkv_same_embed_dim'] = True + + super(MultiheadAttention, self).__setstate__(state) + + def forward(self, query: Tensor, key: Tensor, value: Tensor, key_padding_mask: Optional[Tensor] = None, + need_weights: bool = True, attn_mask: Optional[Tensor] = None) -> Tuple[Tensor, Optional[Tensor]]: + r""" + Args: + query: Query embeddings of shape :math:`(L, N, E_q)` when ``batch_first=False`` or :math:`(N, L, E_q)` + when ``batch_first=True``, where :math:`L` is the target sequence length, :math:`N` is the batch size, + and :math:`E_q` is the query embedding dimension ``embed_dim``. Queries are compared against + key-value pairs to produce the output. See "Attention Is All You Need" for more details. 
+ key: Key embeddings of shape :math:`(S, N, E_k)` when ``batch_first=False`` or :math:`(N, S, E_k)` when + ``batch_first=True``, where :math:`S` is the source sequence length, :math:`N` is the batch size, and + :math:`E_k` is the key embedding dimension ``kdim``. See "Attention Is All You Need" for more details. + value: Value embeddings of shape :math:`(S, N, E_v)` when ``batch_first=False`` or :math:`(N, S, E_v)` when + ``batch_first=True``, where :math:`S` is the source sequence length, :math:`N` is the batch size, and + :math:`E_v` is the value embedding dimension ``vdim``. See "Attention Is All You Need" for more details. + key_padding_mask: If specified, a mask of shape :math:`(N, S)` indicating which elements within ``key`` + to ignore for the purpose of attention (i.e. treat as "padding"). Binary and byte masks are supported. + For a binary mask, a ``True`` value indicates that the corresponding ``key`` value will be ignored for + the purpose of attention. For a byte mask, a non-zero value indicates that the corresponding ``key`` + value will be ignored. + need_weights: If specified, returns ``attn_output_weights`` in addition to ``attn_outputs``. + Default: ``True``. + attn_mask: If specified, a 2D or 3D mask preventing attention to certain positions. Must be of shape + :math:`(L, S)` or :math:`(N\cdot\text{num\_heads}, L, S)`, where :math:`N` is the batch size, + :math:`L` is the target sequence length, and :math:`S` is the source sequence length. A 2D mask will be + broadcasted across the batch while a 3D mask allows for a different mask for each entry in the batch. + Binary, byte, and float masks are supported. For a binary mask, a ``True`` value indicates that the + corresponding position is not allowed to attend. For a byte mask, a non-zero value indicates that the + corresponding position is not allowed to attend. For a float mask, the mask values will be added to + the attention weight. 
+ Outputs: + - **attn_output** - Attention outputs of shape :math:`(L, N, E)` when ``batch_first=False`` or + :math:`(N, L, E)` when ``batch_first=True``, where :math:`L` is the target sequence length, :math:`N` is + the batch size, and :math:`E` is the embedding dimension ``embed_dim``. + - **attn_output_weights** - Attention output weights of shape :math:`(N, L, S)`, where :math:`N` is the batch + size, :math:`L` is the target sequence length, and :math:`S` is the source sequence length. Only returned + when ``need_weights=True``. + """ + if self.batch_first: + query, key, value = [x.transpose(1, 0) for x in (query, key, value)] + + if not self._qkv_same_embed_dim: + attn_output, attn_output_weights = F.multi_head_attention_forward( + query, key, value, self.embed_dim, self.num_heads, + self.in_proj_weight, self.in_proj_bias, + self.bias_k, self.bias_v, self.add_zero_attn, + self.dropout, self.out_proj.weight, self.out_proj.bias, + training=self.training, + key_padding_mask=key_padding_mask, need_weights=need_weights, + attn_mask=attn_mask, use_separate_proj_weight=True, + q_proj_weight=self.q_proj_weight, k_proj_weight=self.k_proj_weight, + v_proj_weight=self.v_proj_weight) + else: + attn_output, attn_output_weights = F.multi_head_attention_forward( + query, key, value, self.embed_dim, self.num_heads, + self.in_proj_weight, self.in_proj_bias, + self.bias_k, self.bias_v, self.add_zero_attn, + self.dropout, self.out_proj.weight, self.out_proj.bias, + training=self.training, + key_padding_mask=key_padding_mask, need_weights=need_weights, + attn_mask=attn_mask) + if self.batch_first: + return attn_output.transpose(1, 0), attn_output_weights + else: + return attn_output, attn_output_weights +class Transformer(Module): + r"""A transformer model. User is able to modify the attributes as needed. The architecture + is based on the paper "Attention Is All You Need". 
Ashish Vaswani, Noam Shazeer, + Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N Gomez, Lukasz Kaiser, and + Illia Polosukhin. 2017. Attention is all you need. In Advances in Neural Information + Processing Systems, pages 6000-6010. Users can build the BERT(https://arxiv.org/abs/1810.04805) + model with corresponding parameters. + Args: + d_model: the number of expected features in the encoder/decoder inputs (default=512). + nhead: the number of heads in the multiheadattention models (default=8). + num_encoder_layers: the number of sub-encoder-layers in the encoder (default=6). + num_decoder_layers: the number of sub-decoder-layers in the decoder (default=6). + dim_feedforward: the dimension of the feedforward network model (default=2048). + dropout: the dropout value (default=0.1). + activation: the activation function of encoder/decoder intermediate layer, can be a string + ("relu" or "gelu") or a unary callable. Default: relu + custom_encoder: custom encoder (default=None). + custom_decoder: custom decoder (default=None). + layer_norm_eps: the eps value in layer normalization components (default=1e-5). + batch_first: If ``True``, then the input and output tensors are provided + as (batch, seq, feature). Default: ``False`` (seq, batch, feature). + norm_first: if ``True``, encoder and decoder layers will perform LayerNorms before + other attention and feedforward operations, otherwise after. Default: ``False`` (after). 
+ Examples:: + >>> transformer_model = nn.Transformer(nhead=16, num_encoder_layers=12) + >>> src = torch.rand((10, 32, 512)) + >>> tgt = torch.rand((20, 32, 512)) + >>> out = transformer_model(src, tgt) + Note: A full example to apply nn.Transformer module for the word language model is available in + https://github.com/pytorch/examples/tree/master/word_language_model + """ + + def __init__(self, d_model: int = 512, nhead: int = 8, num_encoder_layers: int = 6, + num_decoder_layers: int = 6, dim_feedforward: int = 2048, dropout: float = 0.1, + activation: Union[str, Callable[[Tensor], Tensor]] = F.relu, + custom_encoder: Optional[Any] = None, custom_decoder: Optional[Any] = None, + layer_norm_eps: float = 1e-5, batch_first: bool = False, norm_first: bool = False, + device=None, dtype=None) -> None: + factory_kwargs = {'device': device, 'dtype': dtype} + super(Transformer, self).__init__() + + if custom_encoder is not None: + self.encoder = custom_encoder + else: + encoder_layer = TransformerEncoderLayer(d_model, nhead, dim_feedforward, dropout, + activation, layer_norm_eps, batch_first, norm_first, + **factory_kwargs) + encoder_norm = LayerNorm(d_model, eps=layer_norm_eps, **factory_kwargs) + self.encoder = TransformerEncoder(encoder_layer, num_encoder_layers, encoder_norm) + + if custom_decoder is not None: + self.decoder = custom_decoder + else: + decoder_layer = TransformerDecoderLayer(d_model, nhead, dim_feedforward, dropout, + activation, layer_norm_eps, batch_first, norm_first, + **factory_kwargs) + decoder_norm = LayerNorm(d_model, eps=layer_norm_eps, **factory_kwargs) + self.decoder = TransformerDecoder(decoder_layer, num_decoder_layers, decoder_norm) + + self._reset_parameters() + + self.d_model = d_model + self.nhead = nhead + + self.batch_first = batch_first + + def forward(self, src: Tensor, tgt: Tensor, src_mask: Optional[Tensor] = None, tgt_mask: Optional[Tensor] = None, + memory_mask: Optional[Tensor] = None, src_key_padding_mask: Optional[Tensor] = 
None, + tgt_key_padding_mask: Optional[Tensor] = None, memory_key_padding_mask: Optional[Tensor] = None) -> Tensor: + r"""Take in and process masked source/target sequences. + Args: + src: the sequence to the encoder (required). + tgt: the sequence to the decoder (required). + src_mask: the additive mask for the src sequence (optional). + tgt_mask: the additive mask for the tgt sequence (optional). + memory_mask: the additive mask for the encoder output (optional). + src_key_padding_mask: the ByteTensor mask for src keys per batch (optional). + tgt_key_padding_mask: the ByteTensor mask for tgt keys per batch (optional). + memory_key_padding_mask: the ByteTensor mask for memory keys per batch (optional). + Shape: + - src: :math:`(S, N, E)`, `(N, S, E)` if batch_first. + - tgt: :math:`(T, N, E)`, `(N, T, E)` if batch_first. + - src_mask: :math:`(S, S)`. + - tgt_mask: :math:`(T, T)`. + - memory_mask: :math:`(T, S)`. + - src_key_padding_mask: :math:`(N, S)`. + - tgt_key_padding_mask: :math:`(N, T)`. + - memory_key_padding_mask: :math:`(N, S)`. + Note: [src/tgt/memory]_mask ensures that position i is allowed to attend the unmasked + positions. If a ByteTensor is provided, the non-zero positions are not allowed to attend + while the zero positions will be unchanged. If a BoolTensor is provided, positions with ``True`` + are not allowed to attend while ``False`` values will be unchanged. If a FloatTensor + is provided, it will be added to the attention weight. + [src/tgt/memory]_key_padding_mask provides specified elements in the key to be ignored by + the attention. If a ByteTensor is provided, the non-zero positions will be ignored while the zero + positions will be unchanged. If a BoolTensor is provided, the positions with the + value of ``True`` will be ignored while the position with the value of ``False`` will be unchanged. + - output: :math:`(T, N, E)`, `(N, T, E)` if batch_first. 
+ Note: Due to the multi-head attention architecture in the transformer model, + the output sequence length of a transformer is same as the input sequence + (i.e. target) length of the decode. + where S is the source sequence length, T is the target sequence length, N is the + batch size, E is the feature number + Examples: + >>> output = transformer_model(src, tgt, src_mask=src_mask, tgt_mask=tgt_mask) + """ + + if not self.batch_first and src.size(1) != tgt.size(1): + raise RuntimeError("the batch number of src and tgt must be equal") + elif self.batch_first and src.size(0) != tgt.size(0): + raise RuntimeError("the batch number of src and tgt must be equal") + + if src.size(2) != self.d_model or tgt.size(2) != self.d_model: + raise RuntimeError("the feature number of src and tgt must be equal to d_model") + + memory = self.encoder(src, mask=src_mask, src_key_padding_mask=src_key_padding_mask) + output = self.decoder(tgt, memory, tgt_mask=tgt_mask, memory_mask=memory_mask, + tgt_key_padding_mask=tgt_key_padding_mask, + memory_key_padding_mask=memory_key_padding_mask) + return output + + @staticmethod + def generate_square_subsequent_mask(sz: int) -> Tensor: + r"""Generate a square mask for the sequence. The masked positions are filled with float('-inf'). + Unmasked positions are filled with float(0.0). + """ + return torch.triu(torch.full((sz, sz), float('-inf')), diagonal=1) + + def _reset_parameters(self): + r"""Initiate parameters in the transformer model.""" + + for p in self.parameters(): + if p.dim() > 1: + xavier_uniform_(p) + + +class TransformerEncoder(Module): + r"""TransformerEncoder is a stack of N encoder layers + Args: + encoder_layer: an instance of the TransformerEncoderLayer() class (required). + num_layers: the number of sub-encoder-layers in the encoder (required). + norm: the layer normalization component (optional). 
+ Examples:: + >>> encoder_layer = nn.TransformerEncoderLayer(d_model=512, nhead=8) + >>> transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers=6) + >>> src = torch.rand(10, 32, 512) + >>> out = transformer_encoder(src) + """ + __constants__ = ['norm'] + + def __init__(self, encoder_layer, num_layers, norm=None): + super(TransformerEncoder, self).__init__() + self.layers = _get_clones(encoder_layer, num_layers) + self.num_layers = num_layers + self.norm = norm + + def forward(self, src: Tensor, mask: Optional[Tensor] = None, src_key_padding_mask: Optional[Tensor] = None) -> Tensor: + r"""Pass the input through the encoder layers in turn. + Args: + src: the sequence to the encoder (required). + mask: the mask for the src sequence (optional). + src_key_padding_mask: the mask for the src keys per batch (optional). + Shape: + see the docs in Transformer class. + """ + output = src + + for mod in self.layers: + output = mod(output, src_mask=mask, src_key_padding_mask=src_key_padding_mask) + + if self.norm is not None: + output = self.norm(output) + + return output + + +class TransformerDecoder(Module): + r"""TransformerDecoder is a stack of N decoder layers + Args: + decoder_layer: an instance of the TransformerDecoderLayer() class (required). + num_layers: the number of sub-decoder-layers in the decoder (required). + norm: the layer normalization component (optional). 
+ Examples:: + >>> decoder_layer = nn.TransformerDecoderLayer(d_model=512, nhead=8) + >>> transformer_decoder = nn.TransformerDecoder(decoder_layer, num_layers=6) + >>> memory = torch.rand(10, 32, 512) + >>> tgt = torch.rand(20, 32, 512) + >>> out = transformer_decoder(tgt, memory) + """ + __constants__ = ['norm'] + + def __init__(self, decoder_layer, num_layers, norm=None): + super(TransformerDecoder, self).__init__() + self.layers = _get_clones(decoder_layer, num_layers) + self.num_layers = num_layers + self.norm = norm + + def forward(self, tgt: Tensor, memory: Tensor, tgt_mask: Optional[Tensor] = None, + memory_mask: Optional[Tensor] = None, tgt_key_padding_mask: Optional[Tensor] = None, + memory_key_padding_mask: Optional[Tensor] = None) -> Tensor: + r"""Pass the inputs (and mask) through the decoder layer in turn. + Args: + tgt: the sequence to the decoder (required). + memory: the sequence from the last layer of the encoder (required). + tgt_mask: the mask for the tgt sequence (optional). + memory_mask: the mask for the memory sequence (optional). + tgt_key_padding_mask: the mask for the tgt keys per batch (optional). + memory_key_padding_mask: the mask for the memory keys per batch (optional). + Shape: + see the docs in Transformer class. + """ + output = tgt + + for mod in self.layers: + output = mod(output, memory, tgt_mask=tgt_mask, + memory_mask=memory_mask, + tgt_key_padding_mask=tgt_key_padding_mask, + memory_key_padding_mask=memory_key_padding_mask) + + if self.norm is not None: + output = self.norm(output) + + return output + +class TransformerEncoderLayer(Module): + r"""TransformerEncoderLayer is made up of self-attn and feedforward network. + This standard encoder layer is based on the paper "Attention Is All You Need". + Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N Gomez, + Lukasz Kaiser, and Illia Polosukhin. 2017. Attention is all you need. 
class TransformerEncoderLayer(Module):
    r"""One encoder layer: self-attention followed by a feed-forward network.

    Based on "Attention Is All You Need" (Vaswani et al., 2017).

    Args:
        d_model: number of expected features in the input (required).
        nhead: number of attention heads (required).
        dim_feedforward: hidden width of the feed-forward network (default=2048).
        dropout: dropout rate (default=0.1).
        activation: intermediate activation; "relu"/"gelu" string or a callable
            (default: ``F.relu``).
        layer_norm_eps: eps for the layer-norm components (default=1e-5).
        batch_first: if ``True``, tensors are (batch, seq, feature) (default ``False``).
        norm_first: if ``True``, layer norm runs before attention/feed-forward
            (pre-norm); otherwise after (default ``False``).
    """
    __constants__ = ['batch_first', 'norm_first']

    def __init__(self, d_model, nhead, dim_feedforward=2048, dropout=0.1, activation=F.relu,
                 layer_norm_eps=1e-5, batch_first=False, norm_first=False,
                 device=None, dtype=None) -> None:
        factory_kwargs = {'device': device, 'dtype': dtype}
        super(TransformerEncoderLayer, self).__init__()
        self.self_attn = MultiheadAttention(d_model, nhead, dropout=dropout,
                                            batch_first=batch_first, **factory_kwargs)

        # Position-wise feed-forward network.
        self.linear1 = Linear(d_model, dim_feedforward, **factory_kwargs)
        self.dropout = Dropout(dropout)
        self.linear2 = Linear(dim_feedforward, d_model, **factory_kwargs)

        self.norm_first = norm_first
        self.norm1 = LayerNorm(d_model, eps=layer_norm_eps, **factory_kwargs)
        self.norm2 = LayerNorm(d_model, eps=layer_norm_eps, **factory_kwargs)
        self.dropout1 = Dropout(dropout)
        self.dropout2 = Dropout(dropout)

        # Legacy string support for the activation function.
        if isinstance(activation, str):
            self.activation = _get_activation_fn(activation)
        else:
            self.activation = activation

    def __setstate__(self, state):
        # Old pickles may predate the 'activation' attribute.
        state.setdefault('activation', F.relu)
        super(TransformerEncoderLayer, self).__setstate__(state)

    def forward(self, src: Tensor, src_mask: Optional[Tensor] = None,
                src_key_padding_mask: Optional[Tensor] = None) -> Tensor:
        r"""Run one encoder layer over ``src``.

        Args:
            src: the sequence to the encoder layer (required).
            src_mask: the mask for the src sequence (optional).
            src_key_padding_mask: the mask for the src keys per batch (optional).
        """
        # Pre-/post-norm variants; see Fig. 1 of https://arxiv.org/pdf/2002.04745v1.pdf
        x = src
        if self.norm_first:
            x = x + self._sa_block(self.norm1(x), src_mask, src_key_padding_mask)
            x = x + self._ff_block(self.norm2(x))
            return x
        x = self.norm1(x + self._sa_block(x, src_mask, src_key_padding_mask))
        return self.norm2(x + self._ff_block(x))

    def _sa_block(self, x: Tensor,
                  attn_mask: Optional[Tensor], key_padding_mask: Optional[Tensor]) -> Tensor:
        # Self-attention sub-block (weights discarded).
        attn_out = self.self_attn(x, x, x,
                                  attn_mask=attn_mask,
                                  key_padding_mask=key_padding_mask,
                                  need_weights=False)[0]
        return self.dropout1(attn_out)

    def _ff_block(self, x: Tensor) -> Tensor:
        # Feed-forward sub-block.
        return self.dropout2(self.linear2(self.dropout(self.activation(self.linear1(x)))))
+ Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N Gomez, + Lukasz Kaiser, and Illia Polosukhin. 2017. Attention is all you need. In Advances in + Neural Information Processing Systems, pages 6000-6010. Users may modify or implement + in a different way during application. + Args: + d_model: the number of expected features in the input (required). + nhead: the number of heads in the multiheadattention models (required). + dim_feedforward: the dimension of the feedforward network model (default=2048). + dropout: the dropout value (default=0.1). + activation: the activation function of the intermediate layer, can be a string + ("relu" or "gelu") or a unary callable. Default: relu + layer_norm_eps: the eps value in layer normalization components (default=1e-5). + batch_first: If ``True``, then the input and output tensors are provided + as (batch, seq, feature). Default: ``False``. + norm_first: if ``True``, layer norm is done prior to self attention, multihead + attention and feedforward operations, respectivaly. Otherwise it's done after. + Default: ``False`` (after). 
+ Examples:: + >>> decoder_layer = nn.TransformerDecoderLayer(d_model=512, nhead=8) + >>> memory = torch.rand(10, 32, 512) + >>> tgt = torch.rand(20, 32, 512) + >>> out = decoder_layer(tgt, memory) + Alternatively, when ``batch_first`` is ``True``: + >>> decoder_layer = nn.TransformerDecoderLayer(d_model=512, nhead=8, batch_first=True) + >>> memory = torch.rand(32, 10, 512) + >>> tgt = torch.rand(32, 20, 512) + >>> out = decoder_layer(tgt, memory) + """ + __constants__ = ['batch_first', 'norm_first'] + + def __init__(self, d_model, nhead, dim_feedforward=2048, dropout=0.1, activation=F.relu, + layer_norm_eps=1e-5, batch_first=False, norm_first=False, + device=None, dtype=None) -> None: + factory_kwargs = {'device': device, 'dtype': dtype} + super(TransformerDecoderLayer, self).__init__() + self.self_attn = MultiheadAttention(d_model, nhead, dropout=dropout, batch_first=batch_first, + **factory_kwargs) + self.multihead_attn = MultiheadAttention(d_model, nhead, dropout=dropout, batch_first=batch_first, + **factory_kwargs) + # Implementation of Feedforward model + self.linear1 = Linear(d_model, dim_feedforward, **factory_kwargs) + self.dropout = Dropout(dropout) + self.linear2 = Linear(dim_feedforward, d_model, **factory_kwargs) + + self.norm_first = norm_first + self.norm1 = LayerNorm(d_model, eps=layer_norm_eps, **factory_kwargs) + self.norm2 = LayerNorm(d_model, eps=layer_norm_eps, **factory_kwargs) + self.norm3 = LayerNorm(d_model, eps=layer_norm_eps, **factory_kwargs) + self.dropout1 = Dropout(dropout) + self.dropout2 = Dropout(dropout) + self.dropout3 = Dropout(dropout) + + # Legacy string support for activation function. 
+ if isinstance(activation, str): + self.activation = _get_activation_fn(activation) + else: + self.activation = activation + + def __setstate__(self, state): + if 'activation' not in state: + state['activation'] = F.relu + super(TransformerDecoderLayer, self).__setstate__(state) + + def forward(self, tgt: Tensor, memory: Tensor, tgt_mask: Optional[Tensor] = None, memory_mask: Optional[Tensor] = None, + tgt_key_padding_mask: Optional[Tensor] = None, memory_key_padding_mask: Optional[Tensor] = None) -> Tensor: + r"""Pass the inputs (and mask) through the decoder layer. + Args: + tgt: the sequence to the decoder layer (required). + memory: the sequence from the last layer of the encoder (required). + tgt_mask: the mask for the tgt sequence (optional). + memory_mask: the mask for the memory sequence (optional). + tgt_key_padding_mask: the mask for the tgt keys per batch (optional). + memory_key_padding_mask: the mask for the memory keys per batch (optional). + Shape: + see the docs in Transformer class. + """ + # see Fig. 
1 of https://arxiv.org/pdf/2002.04745v1.pdf + + x = tgt + if self.norm_first: + x = x + self._sa_block(self.norm1(x), tgt_mask, tgt_key_padding_mask) + x = x + self._mha_block(self.norm2(x), memory, memory_mask, memory_key_padding_mask) + x = x + self._ff_block(self.norm3(x)) + else: + x = self.norm1(x + self._sa_block(x, tgt_mask, tgt_key_padding_mask)) + x = self.norm2(x + self._mha_block(x, memory, memory_mask, memory_key_padding_mask)) + x = self.norm3(x + self._ff_block(x)) + + return x + + # self-attention block + def _sa_block(self, x: Tensor, + attn_mask: Optional[Tensor], key_padding_mask: Optional[Tensor]) -> Tensor: + x = self.self_attn(x, x, x, + attn_mask=attn_mask, + key_padding_mask=key_padding_mask, + need_weights=False)[0] + return self.dropout1(x) + + # multihead attention block + def _mha_block(self, x: Tensor, mem: Tensor, + attn_mask: Optional[Tensor], key_padding_mask: Optional[Tensor]) -> Tensor: + x = self.multihead_attn(x, mem, mem, + attn_mask=attn_mask, + key_padding_mask=key_padding_mask, + need_weights=False)[0] + return self.dropout2(x) + + # feed forward block + def _ff_block(self, x: Tensor) -> Tensor: + x = self.linear2(self.dropout(self.activation(self.linear1(x)))) + return self.dropout3(x) + + +def _get_clones(module, N): + return ModuleList([copy.deepcopy(module) for i in range(N)]) + + +def _get_activation_fn(activation): + if activation == "relu": + return F.relu + elif activation == "gelu": + return F.gelu + + raise RuntimeError("activation should be relu/gelu, not {}".format(activation)) +class PositionalEncoding(nn.Module): + def __init__(self, + emb_size: int, + dropout: float, + maxlen: int = 200): + super(PositionalEncoding, self).__init__() + den = torch.exp(- torch.arange(0, emb_size, 2)* math.log(10000) / emb_size) + pos = torch.arange(0, maxlen).reshape(maxlen, 1) + pos_embedding = torch.zeros((maxlen, emb_size)) + pos_embedding[:, 0::2] = torch.sin(pos * den) + pos_embedding[:, 1::2] = torch.cos(pos * den) + 
class Model(nn.Module):
    """Encoder-decoder Transformer over token sequences.

    ``config`` is a dict keyed by the string constants in
    ``SCMG.config.varables`` (vocab size, block size, attention dims, ...).
    """

    def __init__(self, config):
        super().__init__()
        self.tok_emb = nn.Embedding(config[varables.SIZE_VOCAB], config[varables.DIM_ATTENTION])
        # Learned absolute position embeddings, shared by encoder and decoder inputs.
        self.pos_emb = nn.Parameter(
            torch.zeros(1, config[varables.SIZE_BLOCK], config[varables.DIM_ATTENTION]))
        self.drop = nn.Dropout(config[varables.RATE_DROPOUT])
        self.transformer_model = Transformer(d_model=config[varables.DIM_ATTENTION],
                                             nhead=config[varables.NUM_HEADS],
                                             dim_feedforward=config[varables.DIM_FEEDFORWARD],
                                             num_encoder_layers=config[varables.NUM_ENCODER_LAYERS],
                                             num_decoder_layers=config[varables.NUM_DECODER_LAYERS],
                                             dropout=config[varables.RATE_DROPOUT],
                                             activation='gelu',
                                             batch_first=True,
                                             # device=config[varables.DEVICE]
                                             )
        self.generator = nn.Linear(config[varables.DIM_ATTENTION], config[varables.SIZE_VOCAB])
        # BUG FIX: init_scheduler() reads self.optimizer, which was never
        # assigned anywhere — define it here and set it in init_optimizers().
        self.optimizer = None
        self.apply(self._init_weights)

    def _init_weights(self, module):
        """GPT-style init: N(0, 0.02) for Linear/Embedding weights, zeros for
        biases, ones for LayerNorm weights."""
        if isinstance(module, (nn.Linear, nn.Embedding)):
            module.weight.data.normal_(mean=0.0, std=0.02)
            if isinstance(module, nn.Linear) and module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

    def init_optimizers(self, train_config):
        """Create an Adam optimizer over all parameters and remember it on self."""
        # BUG FIX: store the optimizer so init_scheduler() can reference it.
        self.optimizer = torch.optim.Adam(self.parameters(),
                                          lr=train_config[varables.RATE_LEARNING])
        return self.optimizer

    def init_scheduler(self, train_config):
        """Create a StepLR scheduler for the optimizer made by init_optimizers()."""
        scheduler = torch.optim.lr_scheduler.StepLR(self.optimizer,
                                                    step_size=train_config[varables.SIZE_STEP],
                                                    gamma=train_config[varables.GAMMA])
        return scheduler

    def get_collate_fn(self, vocab):
        """Return a DataLoader collate fn that right-pads encoder and decoder
        token lists (independently) with the PAD token."""
        def collate(results):
            x_in = [a[0] for a in results]
            y_in = [a[1] for a in results]
            boundary = None  # unused by this model; kept for interface parity
            max_len = max([len(a) for a in x_in])
            x_in = torch.tensor([a + [vocab[varables.TOKEN_PAD]] * (max_len - len(a)) for a in x_in],
                                dtype=torch.long)
            max_len = max([len(a) for a in y_in])
            y = torch.tensor([(a + [vocab[varables.TOKEN_PAD]] * (max_len - len(a))) for a in y_in],
                             dtype=torch.long)
            return x_in, y, boundary
        return collate

    def forward(self, x_in, y_in, y_out=None, boundary=None):
        """Run source/target through the Transformer.

        Returns (logits, loss); loss is cross-entropy against ``y_out`` when
        given, else ``None``.
        """
        _, t_x = x_in.size()
        _, t_y = y_in.size()
        x_token_embeddings = self.tok_emb(x_in)
        y_token_embeddings = self.tok_emb(y_in)
        x_position_embeddings = self.pos_emb[:, :t_x, :]
        y_position_embeddings = self.pos_emb[:, :t_y, :]
        x = self.drop(x_token_embeddings + x_position_embeddings)
        y = self.drop(y_token_embeddings + y_position_embeddings)
        # Causal mask so decoder positions cannot attend to the future.
        decoder_mask = self.transformer_model.generate_square_subsequent_mask(t_y).to(y_in.device)
        out = self.transformer_model(x, y, tgt_mask=decoder_mask)
        logits = self.generator(out)
        loss = None
        if y_out is not None:
            loss = F.cross_entropy(logits.view(-1, logits.size(-1)), y_out.view(-1))
        return logits, loss


# --- sampler utilities (SCMG/models/Transformer_Torch/sampler.py) ---

def set_seed(seed):
    """Seed python, numpy and torch (all CUDA devices) for reproducibility."""
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)


def top_k_logits(logits, k):
    """Return a copy of ``logits`` with every entry below the k-th largest
    value of its row replaced by -inf (input is not mutated)."""
    v, ix = torch.topk(logits, k)
    out = logits.clone()
    out[out < v[:, [-1]]] = -float('Inf')
    return out


@torch.no_grad()
def sample(model, x, steps, temperature=1.0, sample=False, top_k=None):
    """Autoregressively extend ``x`` by ``steps`` tokens.

    Greedy by default; multinomial when ``sample`` is True; optionally
    restricted to the ``top_k`` most likely tokens. The context is cropped to
    the model's block size each step.
    """
    block_size = model.get_block_size()
    model.eval()
    for k in range(steps):
        x_cond = x if x.size(1) <= block_size else x[:, -block_size:]
        logits, _ = model(x_cond)
        logits = logits[:, -1, :] / temperature
        if top_k is not None:
            logits = top_k_logits(logits, top_k)
        probs = F.softmax(logits, dim=-1)
        if sample:
            ix = torch.multinomial(probs, num_samples=1)
        else:
            _, ix = torch.topk(probs, k=1, dim=-1)
        x = torch.cat((x, ix), dim=1)

    return x
@torch.no_grad()
def sample(model, x, steps, temperature=1.0, boundary=None):
    """Autoregressively extend ``x`` by ``steps`` multinomially sampled tokens,
    passing ``boundary`` through to the model each step.

    NOTE(review): this redefines (and shadows) the earlier ``sample`` defined
    above in this module — only this boundary-aware variant is reachable.
    """
    block_size = model.get_block_size()
    model.eval()
    for _ in range(steps):
        context = x if x.size(1) <= block_size else x[:, -block_size:]
        logits, _ = model(context, boundary=boundary)
        step_logits = logits[:, -1, :] / temperature
        probs = F.softmax(step_logits, dim=-1)
        nxt = torch.multinomial(probs, num_samples=1)
        x = torch.cat((x, nxt), dim=1)
    return x


# Stray scratch literal left over from an interactive session.
'L_5*C(=O)NCc1cccc(OC)c1.*c1nsc2ccccc12COc1cccc(CNC(=O)c2cccc(NC(=O)c3nsc4ccccc34)c2)c1'


# for i in range(1,21):x
def sample_L(i, option='string'):
    """Scratch/debug driver that samples 32 completions of a fixed scaffold
    prompt prefixed with ``L_<i>`` and prints each decoded string plus a 1/0
    validity flag.

    NOTE(review): relies on module-level globals ``vocab``, ``inv``, ``model``
    and ``test_valid`` that are not defined in this file — calling it as-is
    raises NameError. Presumably meant to run in a notebook session.
    """
    # i=2
    prefix = 'L_' + str(i)
    string_input = prefix + '*O=C1NN=Cc2c1cccc2.*O=C(C1CC1)N1CCNCC1'
    array_input = [vocab[a] for a in [''] + list(string_input)]
    boundary = [len(array_input)]
    tensor_input = torch.tensor(array_input, device='cuda').unsqueeze(0).repeat(32, 1)
    boundary = boundary * 32
    tensor_output = sample(model, tensor_input, 250, boundary=boundary)
    strings_output = []
    for j in range(tensor_output.shape[0]):
        # Decode everything generated after the prompt boundary, skipping pads.
        tokens = [inv[a] for a in tensor_output[j, boundary[j]:].cpu().numpy()
                  if a != vocab['']]
        # if list_string_output[0] == '':
        #     list_string_output = list_string_output[1:]
        if tokens[-1] == '':
            tokens = tokens[:-1]
        decoded = ''.join(tokens)
        strings_output.append(decoded)
        print(decoded)
    for j in range(tensor_output.shape[0]):
        if test_valid(strings_output[j]):
            print(1)
        else:
            print(0)

    # logits,_ = model(tensor_input,boundary=boundary)


# Stray scratch literal: the tokenized form of the prompt above.
['', 'L', '_', '5', '*', 'C', '(', '=', 'O', ')', 'N', 'C', 'c', '1', 'c', 'c', 'c', 'c',
 '(', 'O', 'C', ')', 'c', '1', '.', '*', 'c', '1', 'n', 's', 'c', '2', 'c', 'c', 'c', 'c',
 'c', '1', '2', 'C', 'O', 'c', '1', 'c', 'c', 'c', 'c', '(', 'C', 'N', 'C', '(', '=', 'O',
 ')', 'c', '2', 'c', 'c', 'c', 'c', '(', 'N', 'C', '(', '=', 'O', ')', 'c', '3', 'n', 's',
 'c', '4', 'c', 'c', 'c', 'c', 'c', '3', '4', ')', 'c', '2', ')', 'c', '1', '']


# ===== SCMG/models/Transformer_debug copy/model copy 2.py =====
import math
import logging

import torch
import torch.nn as nn
from torch.nn import functional as F

logger = logging.getLogger(__name__)

# class ModelConfig():
#     rate_dropout_embedding = 0.1
#     rate_dropout_residue = 0.1
#     rate_dropout_attention = 0.1
#     block_size = 125
#     def __init__(self, size_vocab, **kwargs):
#         self.size_vocab = size_vocab
#         for k, v in kwargs.items():
#             setattr(self, k, v)
from SCMG.config import varables  # string key constants for the config dict


class CausalSelfAttention(nn.Module):
    """Multi-head self-attention with a lower-triangular (causal) mask so each
    position only attends to itself and earlier positions.

    ``config`` is a dict keyed by ``SCMG.config.varables`` constants.
    """

    def __init__(self, config):
        super().__init__()
        assert config[varables.DIM_ATTENTION] % config[varables.NUM_HEADS] == 0
        self.key = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION])
        self.query = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION])
        self.value = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION])
        self.dropout_attention = nn.Dropout(config[varables.RATE_DROPOUT])
        self.dropout_residue = nn.Dropout(config[varables.RATE_DROPOUT])
        self.projection = nn.Linear(config[varables.DIM_ATTENTION], config[varables.DIM_EMBEDDING])
        # Causal mask buffer, shaped (1, 1, block, block) for broadcasting.
        self.register_buffer(
            "mask",
            torch.tril(torch.ones(config[varables.SIZE_BLOCK], config[varables.SIZE_BLOCK]))
            .view(1, 1, config[varables.SIZE_BLOCK], config[varables.SIZE_BLOCK]))
        self.n_head = config[varables.NUM_HEADS]
        self.single_head_dim = config[varables.DIM_ATTENTION] // self.n_head
        self.attention_features = config[varables.DIM_ATTENTION]

    def forward(self, x, layer_past=None):
        batch, seq, _ = x.size()
        # Project then reshape to (batch, head, seq, head_dim).
        k = self.key(x).view(batch, seq, self.n_head, self.single_head_dim).transpose(1, 2)
        q = self.query(x).view(batch, seq, self.n_head, self.single_head_dim).transpose(1, 2)
        v = self.value(x).view(batch, seq, self.n_head, self.single_head_dim).transpose(1, 2)
        # Scaled dot-product scores with the causal mask applied pre-softmax.
        scores = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1)))
        scores = scores.masked_fill(self.mask[:, :, :seq, :seq] == 0, float('-inf'))
        weights = self.dropout_attention(F.softmax(scores, dim=-1))
        out = (weights @ v).transpose(1, 2).contiguous().view(batch, seq, self.attention_features)
        return self.dropout_residue(self.projection(out))


class CrossAttention(nn.Module):
    """Multi-head attention from decoder queries onto encoder keys/values.

    NOTE(review): the same lower-triangular mask as self-attention is applied
    across (decoder-pos, encoder-pos) — presumably intentional for the
    boundary/prefix decoding scheme used elsewhere; confirm.
    """

    def __init__(self, config):
        super().__init__()
        assert config[varables.DIM_ATTENTION] % config[varables.NUM_HEADS] == 0
        self.key = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION])
        self.query = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION])
        self.value = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION])
        self.dropout_attention = nn.Dropout(config[varables.RATE_DROPOUT])
        self.dropout_residue = nn.Dropout(config[varables.RATE_DROPOUT])
        self.projection = nn.Linear(config[varables.DIM_ATTENTION], config[varables.DIM_EMBEDDING])
        self.n_head = config[varables.NUM_HEADS]
        self.single_head_dim = config[varables.DIM_ATTENTION] // self.n_head
        self.attention_features = config[varables.DIM_ATTENTION]
        self.register_buffer(
            "mask",
            torch.tril(torch.ones(config[varables.SIZE_BLOCK], config[varables.SIZE_BLOCK]))
            .view(1, 1, config[varables.SIZE_BLOCK], config[varables.SIZE_BLOCK]))

    def forward(self, x_encoder, x_decoder, layer_past=None):
        batch_enc, seq_enc, _ = x_encoder.size()
        batch_dec, seq_dec, _ = x_decoder.size()
        # NOTE(review): views below use the encoder batch size throughout,
        # assuming encoder and decoder batches are the same size.
        k = self.key(x_encoder).view(batch_enc, seq_enc, self.n_head, self.single_head_dim).transpose(1, 2)
        q = self.query(x_decoder).view(batch_enc, seq_dec, self.n_head, self.single_head_dim).transpose(1, 2)
        v = self.value(x_encoder).view(batch_enc, seq_enc, self.n_head, self.single_head_dim).transpose(1, 2)
        scores = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1)))
        scores = scores.masked_fill(self.mask[:, :, :seq_dec, :seq_enc] == 0, float('-inf'))
        weights = self.dropout_attention(F.softmax(scores, dim=-1))
        out = (weights @ v).transpose(1, 2).contiguous().view(batch_enc, seq_dec, self.attention_features)
        return self.dropout_residue(self.projection(out))


class EncoderBlock(nn.Module):
    """Pre-norm encoder block: causal self-attention then an MLP, each with a
    residual connection."""

    def __init__(self, config):
        super().__init__()
        self.ln1 = nn.LayerNorm(config[varables.DIM_EMBEDDING])
        self.ln2 = nn.LayerNorm(config[varables.DIM_EMBEDDING])
        self.attn = CausalSelfAttention(config)
        self.mlp = nn.Sequential(
            nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_FEEDFORWARD]),
            nn.GELU(),
            nn.Linear(config[varables.DIM_FEEDFORWARD], config[varables.DIM_EMBEDDING]),
            nn.Dropout(config[varables.RATE_DROPOUT]),
        )

    def forward(self, x):
        x = x + self.attn(self.ln1(x))
        x = x + self.mlp(self.ln2(x))
        return x
class DecoderBlock(nn.Module):
    """Pre-norm decoder block: masked self-attention, cross-attention over the
    encoder output, then an MLP — each with a residual connection.

    ``config`` is a dict keyed by ``SCMG.config.varables`` constants.
    """

    def __init__(self, config):
        super().__init__()
        self.ln1 = nn.LayerNorm(config[varables.DIM_EMBEDDING])
        self.ln2 = nn.LayerNorm(config[varables.DIM_EMBEDDING])
        self.masked_attn = CausalSelfAttention(config)
        self.cross_attn = CrossAttention(config)
        self.mlp = nn.Sequential(
            nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_FEEDFORWARD]),
            nn.GELU(),
            nn.Linear(config[varables.DIM_FEEDFORWARD], config[varables.DIM_EMBEDDING]),
            nn.Dropout(config[varables.RATE_DROPOUT]),
        )

    def forward(self, x_encoder, x):
        x = x + self.masked_attn(self.ln1(x))
        # NOTE(review): ln1 is reused for the cross-attention input instead of
        # a dedicated third LayerNorm — confirm this is intentional.
        x = x + self.cross_attn(x_encoder, self.ln1(x))
        x = x + self.mlp(self.ln2(x))
        return x


import torch
import torch.nn as nn
import torch.nn.functional as F
import math


class Norm(nn.Module):
    """Layer normalization with learnable scale (alpha) and shift (bias).

    Uses the sample standard deviation over the last dimension.
    """

    def __init__(self, d_model, eps=1e-6):
        super().__init__()
        self.size = d_model
        # Learnable calibration parameters.
        self.alpha = nn.Parameter(torch.ones(self.size))
        self.bias = nn.Parameter(torch.zeros(self.size))
        self.eps = eps

    def forward(self, x):
        centered = x - x.mean(dim=-1, keepdim=True)
        return self.alpha * centered / (x.std(dim=-1, keepdim=True) + self.eps) + self.bias


def attention(q, k, v, d_k, mask=None, dropout=None):
    """Scaled dot-product attention; optional mask (0 = blocked) and dropout
    are applied to the attention weights."""
    scores = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(d_k)
    if mask is not None:
        mask = mask.unsqueeze(1)  # broadcast over the head dimension
        scores = scores.masked_fill(mask == 0, -1e9)
    scores = F.softmax(scores, dim=-1)
    if dropout is not None:
        scores = dropout(scores)
    return torch.matmul(scores, v)


class MultiHeadAttention(nn.Module):
    """Standard multi-head attention: per-head linear projections, scaled
    dot-product attention, concatenation, output projection."""

    def __init__(self, heads, d_model, dropout=0.1):
        super().__init__()
        self.d_model = d_model
        self.d_k = d_model // heads
        self.h = heads
        self.q_linear = nn.Linear(d_model, d_model)
        self.v_linear = nn.Linear(d_model, d_model)
        self.k_linear = nn.Linear(d_model, d_model)
        self.dropout = nn.Dropout(dropout)
        self.out = nn.Linear(d_model, d_model)

    def forward(self, q, k, v, mask=None):
        bs = q.size(0)
        # Project, split into heads, and move heads before sequence:
        # (bs, seq, d_model) -> (bs, h, seq, d_k).
        k = self.k_linear(k).view(bs, -1, self.h, self.d_k).transpose(1, 2)
        q = self.q_linear(q).view(bs, -1, self.h, self.d_k).transpose(1, 2)
        v = self.v_linear(v).view(bs, -1, self.h, self.d_k).transpose(1, 2)
        scores = attention(q, k, v, self.d_k, mask, self.dropout)
        # Re-join heads and apply the final projection.
        concat = scores.transpose(1, 2).contiguous().view(bs, -1, self.d_model)
        return self.out(concat)


class FeedForward(nn.Module):
    """Two-layer position-wise feed-forward network with ReLU and dropout."""

    def __init__(self, d_model, d_ff=2048, dropout=0.1):
        super().__init__()
        self.linear_1 = nn.Linear(d_model, d_ff)
        self.dropout = nn.Dropout(dropout)
        self.linear_2 = nn.Linear(d_ff, d_model)

    def forward(self, x):
        return self.linear_2(self.dropout(F.relu(self.linear_1(x))))


import torch
import torch.nn as nn
import copy


class EncoderLayer(nn.Module):
    """Pre-norm encoder layer: self-attention then feed-forward, each with a
    residual connection and dropout."""

    def __init__(self, d_model, heads, dropout=0.1):
        super().__init__()
        self.norm_1 = Norm(d_model)
        self.norm_2 = Norm(d_model)
        self.attn = MultiHeadAttention(heads, d_model, dropout=dropout)
        self.ff = FeedForward(d_model, dropout=dropout)
        self.dropout_1 = nn.Dropout(dropout)
        self.dropout_2 = nn.Dropout(dropout)

    def forward(self, x, mask):
        normed = self.norm_1(x)
        x = x + self.dropout_1(self.attn(normed, normed, normed, mask))
        normed = self.norm_2(x)
        x = x + self.dropout_2(self.ff(normed))
        return x
# build a decoder layer with two multi-head attention layers and
# one feed-forward layer
class DecoderLayer(nn.Module):
    """Pre-norm decoder layer: masked self-attention, cross-attention over the
    encoder outputs, then feed-forward — each with a residual connection."""

    def __init__(self, d_model, heads, dropout=0.1):
        super().__init__()
        self.norm_1 = Norm(d_model)
        self.norm_2 = Norm(d_model)
        self.norm_3 = Norm(d_model)

        self.dropout_1 = nn.Dropout(dropout)
        self.dropout_2 = nn.Dropout(dropout)
        self.dropout_3 = nn.Dropout(dropout)

        self.attn_1 = MultiHeadAttention(heads, d_model, dropout=dropout)
        self.attn_2 = MultiHeadAttention(heads, d_model, dropout=dropout)
        self.ff = FeedForward(d_model, dropout=dropout)

    def forward(self, x, e_outputs, src_mask, trg_mask):
        x2 = self.norm_1(x)
        # Masked self-attention over the target.
        x = x + self.dropout_1(self.attn_1(x2, x2, x2, trg_mask))
        x2 = self.norm_2(x)
        # Cross-attention: target queries over encoder outputs.
        x = x + self.dropout_2(self.attn_2(x2, e_outputs, e_outputs, src_mask))
        x2 = self.norm_3(x)
        x = x + self.dropout_3(self.ff(x2))
        return x


import torch
import torch.nn as nn
import math
from torch.autograd import Variable  # kept for compatibility; no longer used below


class Embedder(nn.Module):
    """Thin wrapper around nn.Embedding (token id -> d_model vector)."""

    def __init__(self, vocab_size, d_model):
        super().__init__()
        self.d_model = d_model
        self.embed = nn.Embedding(vocab_size, d_model)

    def forward(self, x):
        return self.embed(x)


class PositionalEncoder(nn.Module):
    """Adds a fixed sinusoidal positional signal to (batch, seq, d_model)
    embeddings, after scaling them by sqrt(d_model).

    NOTE(review): the table fills pairs (i, i+1); assumes d_model is even.
    """

    def __init__(self, d_model, max_seq_len=200, dropout=0.1):
        super().__init__()
        self.d_model = d_model
        self.dropout = nn.Dropout(dropout)
        # Constant 'pe' matrix with values dependent on pos and i.
        pe = torch.zeros(max_seq_len, d_model)
        for pos in range(max_seq_len):
            for i in range(0, d_model, 2):
                pe[pos, i] = math.sin(pos / (10000 ** ((2 * i) / d_model)))
                pe[pos, i + 1] = math.cos(pos / (10000 ** ((2 * (i + 1)) / d_model)))
        pe = pe.unsqueeze(0)
        self.register_buffer('pe', pe)

    def forward(self, x):
        # Make embeddings relatively larger than the positional signal.
        x = x * math.sqrt(self.d_model)
        seq_len = x.size(1)
        # BUG FIX: the old code called `pe.cuda()` and discarded the result,
        # so the slice never actually moved for CUDA inputs; move it to x's
        # device instead. The deprecated Variable wrapper is also dropped —
        # the registered buffer already has requires_grad=False.
        pe = self.pe[:, :seq_len].to(x.device)
        x = x + pe
        return self.dropout(x)


def get_clones(module, N):
    """Return a ModuleList of N independent deep copies of ``module``."""
    return nn.ModuleList([copy.deepcopy(module) for i in range(N)])


class Encoder(nn.Module):
    """Embedding + positional encoding + N encoder layers + final norm."""

    def __init__(self, vocab_size, d_model, N, heads, dropout):
        super().__init__()
        self.N = N
        self.embed = Embedder(vocab_size, d_model)
        self.pe = PositionalEncoder(d_model, dropout=dropout)
        self.layers = get_clones(EncoderLayer(d_model, heads, dropout), N)
        self.norm = Norm(d_model)

    def forward(self, src, mask):
        x = self.pe(self.embed(src))
        for i in range(self.N):
            x = self.layers[i](x, mask)
        return self.norm(x)


class Decoder(nn.Module):
    """Embedding + positional encoding + N decoder layers + final norm."""

    def __init__(self, vocab_size, d_model, N, heads, dropout):
        super().__init__()
        self.N = N
        self.embed = Embedder(vocab_size, d_model)
        self.pe = PositionalEncoder(d_model, dropout=dropout)
        self.layers = get_clones(DecoderLayer(d_model, heads, dropout), N)
        self.norm = Norm(d_model)

    def forward(self, trg, e_outputs, src_mask, trg_mask):
        x = self.pe(self.embed(trg))
        for i in range(self.N):
            x = self.layers[i](x, e_outputs, src_mask, trg_mask)
        return self.norm(x)
class Model(nn.Module):
    """Seq2seq Transformer built from the hand-rolled Encoder/Decoder above.

    ``config`` is a dict keyed by ``SCMG.config.varables`` constants plus
    "vocab_encoder"/"vocab_decoder" token->id mappings.
    """

    def __init__(self, config):
        super().__init__()
        self.encoder = Encoder(len(config["vocab_encoder"]), config[varables.DIM_ATTENTION],
                               config[varables.NUM_LAYERS], config[varables.NUM_HEADS],
                               config[varables.RATE_DROPOUT])
        self.decoder = Decoder(len(config["vocab_decoder"]), config[varables.DIM_ATTENTION],
                               config[varables.NUM_LAYERS], config[varables.NUM_HEADS],
                               config[varables.RATE_DROPOUT])
        self.out = nn.Linear(config[varables.DIM_ATTENTION], len(config["vocab_decoder"]))
        # BUG FIX: init_scheduler() reads self.optimizer; it is now assigned
        # by init_optimizers() instead of staying None forever.
        self.optimizer = None

    def get_block_size(self):
        # NOTE(review): self.block_size is never assigned in this class (the
        # assignment was commented out of __init__), so calling this raises
        # AttributeError — restore `self.block_size = config[varables.SIZE_BLOCK]`
        # if samplers need it.
        return self.block_size

    def _init_weights(self, module):
        """GPT-style init: N(0, 0.02) for Linear/Embedding weights, zeros for
        biases, ones for LayerNorm weights. (Currently unused: the
        `self.apply(self._init_weights)` call was commented out.)"""
        if isinstance(module, (nn.Linear, nn.Embedding)):
            module.weight.data.normal_(mean=0.0, std=0.02)
            if isinstance(module, nn.Linear) and module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

    def init_optimizers(self, train_config):
        """Create an Adam optimizer over all parameters and remember it on self."""
        # BUG FIX: store the optimizer so init_scheduler() can reference it.
        self.optimizer = torch.optim.Adam(self.parameters(),
                                          lr=train_config[varables.RATE_LEARNING])
        return self.optimizer

    def init_scheduler(self, train_config):
        """Create a StepLR scheduler for the optimizer made by init_optimizers()."""
        scheduler = torch.optim.lr_scheduler.StepLR(self.optimizer,
                                                    step_size=train_config[varables.SIZE_STEP],
                                                    gamma=train_config[varables.GAMMA])
        return scheduler

    def get_collate_fn(self, vocab_encoder, vocab_decoder):
        """Return a DataLoader collate fn that right-pads encoder and decoder
        token lists (independently) with each vocab's PAD token."""
        def collate(results):
            x_in = [a[0] for a in results]
            y_in = [a[1] for a in results]
            boundary = -1  # unused by this model; kept for interface parity
            max_len_x = max([len(a) for a in x_in])
            max_len_y = max([len(a) for a in y_in])
            x = torch.tensor([(a + [vocab_encoder[varables.TOKEN_PAD]] * (max_len_x - len(a))) for a in x_in],
                             dtype=torch.long)
            y = torch.tensor([(a + [vocab_decoder[varables.TOKEN_PAD]] * (max_len_y - len(a))) for a in y_in],
                             dtype=torch.long)
            return x, y, boundary
        return collate

    def forward(self, src, trg, trg_out, boundary=None):
        """Encode ``src``, decode ``trg`` with a causal mask, project to vocab
        logits. Returns (logits, loss); loss is cross-entropy against
        ``trg_out`` when given, else None."""
        src_mask = None
        # Causal mask over target positions, shaped (1, 1, T, T).
        # NOTE(review): attention() additionally unsqueezes the mask — confirm
        # the resulting broadcast is what was intended.
        trg_mask = torch.tril(torch.ones(trg.shape[1], trg.shape[1])).view(
            1, 1, trg.shape[1], trg.shape[1]).to(trg.device)
        e_outputs = self.encoder(src, src_mask)
        d_output = self.decoder(trg, e_outputs, src_mask, trg_mask)
        logits = self.out(d_output)
        loss = None
        if trg_out is not None:
            loss = F.cross_entropy(logits.view(-1, logits.size(-1)), trg_out.view(-1))
        return logits, loss

# mark test
b/SCMG/models/Transformer_debug/__pycache__/model.cpython-310.pyc differ diff --git a/SCMG/models/Transformer_debug/__pycache__/sampler.cpython-310.pyc b/SCMG/models/Transformer_debug/__pycache__/sampler.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..81632f9311fa817ec3b38eeebebe3d15e43abab0 Binary files /dev/null and b/SCMG/models/Transformer_debug/__pycache__/sampler.cpython-310.pyc differ diff --git a/SCMG/models/Transformer_debug/model copy 2.py b/SCMG/models/Transformer_debug/model copy 2.py new file mode 100644 index 0000000000000000000000000000000000000000..8b254fbe02eafd3f7370cfae17eb6729563aa260 --- /dev/null +++ b/SCMG/models/Transformer_debug/model copy 2.py @@ -0,0 +1,420 @@ +import math +import logging + +import torch +import torch.nn as nn +from torch.nn import functional as F + +logger = logging.getLogger(__name__) +from SCMG.config import varables + +# class ModelConfig(): +# rate_dropout_embedding = 0.1 +# rate_dropout_residue = 0.1 +# rate_dropout_attention = 0.1 +# block_size=125 +# def __init__(self, size_vocab, **kwargs): +# self.size_vocab = size_vocab +# for k,v in kwargs.items(): +# setattr(self, k, v) + +class CausalSelfAttention(nn.Module): + def __init__(self, config): + super().__init__() + assert config[varables.DIM_ATTENTION] % config[varables.NUM_HEADS] == 0 + self.key = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION]) + self.query = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION]) + self.value = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION]) + self.dropout_attention = nn.Dropout(config[varables.RATE_DROPOUT]) + self.dropout_residue = nn.Dropout(config[varables.RATE_DROPOUT]) + self.projection = nn.Linear(config[varables.DIM_ATTENTION], config[varables.DIM_EMBEDDING]) + self.register_buffer("mask", torch.tril(torch.ones(config[varables.SIZE_BLOCK], config[varables.SIZE_BLOCK])) + .view(1, 1, config[varables.SIZE_BLOCK], 
config[varables.SIZE_BLOCK])) + self.n_head = config[varables.NUM_HEADS] + self.single_head_dim = config[varables.DIM_ATTENTION] // self.n_head + self.attention_features = config[varables.DIM_ATTENTION] + + def forward(self, x, layer_past=None): + B, T, C = x.size() + k = self.key(x).view(B, T, self.n_head,self.single_head_dim).transpose(1, 2) + q = self.query(x).view(B, T, self.n_head,self.single_head_dim).transpose(1, 2) + v = self.value(x).view(B, T, self.n_head,self.single_head_dim).transpose(1, 2) + att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1))) + att = att.masked_fill(self.mask[:,:,:T,:T] == 0, float('-inf')) + att = F.softmax(att, dim=-1) + att = self.dropout_attention(att) + y = att @ v + y = y.transpose(1, 2).contiguous().view(B, T, self.attention_features) + y = self.dropout_residue(self.projection(y)) + return y + + +class CrossAttention(nn.Module): + def __init__(self, config): + super().__init__() + assert config[varables.DIM_ATTENTION] % config[varables.NUM_HEADS] == 0 + self.key = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION]) + self.query = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION]) + self.value = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION]) + self.dropout_attention = nn.Dropout(config[varables.RATE_DROPOUT]) + self.dropout_residue = nn.Dropout(config[varables.RATE_DROPOUT]) + self.projection = nn.Linear(config[varables.DIM_ATTENTION], config[varables.DIM_EMBEDDING]) + self.n_head = config[varables.NUM_HEADS] + self.single_head_dim = config[varables.DIM_ATTENTION] // self.n_head + self.attention_features = config[varables.DIM_ATTENTION] + self.register_buffer("mask", torch.tril(torch.ones(config[varables.SIZE_BLOCK], config[varables.SIZE_BLOCK])) + .view(1, 1, config[varables.SIZE_BLOCK], config[varables.SIZE_BLOCK])) + + def forward(self, x_encoder,x_decoder, layer_past=None): + B_encoder, T_encoder, C_encoder = x_encoder.size() + B_decoder, 
T_decoder, C_decoder = x_decoder.size() + k = self.key( x_encoder).view(B_encoder, T_encoder, self.n_head,self.single_head_dim).transpose(1, 2) + q = self.query(x_decoder).view(B_encoder, T_decoder, self.n_head,self.single_head_dim).transpose(1, 2) + v = self.value(x_encoder).view(B_encoder, T_encoder, self.n_head,self.single_head_dim).transpose(1, 2) + att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1))) + att = att.masked_fill(self.mask[:,:,:T_decoder,:T_encoder] == 0, float('-inf')) + att = F.softmax(att, dim=-1) + att = self.dropout_attention(att) + y = att @ v + y = y.transpose(1, 2).contiguous().view(B_encoder, T_decoder, self.attention_features) + y = self.dropout_residue(self.projection(y)) + return y + + + + +class EncoderBlock(nn.Module): + def __init__(self, config): + super().__init__() + self.ln1 = nn.LayerNorm(config[varables.DIM_EMBEDDING]) + self.ln2 = nn.LayerNorm(config[varables.DIM_EMBEDDING]) + self.attn = CausalSelfAttention(config) + self.mlp = nn.Sequential( + nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_FEEDFORWARD]), + nn.GELU(), + nn.Linear(config[varables.DIM_FEEDFORWARD], config[varables.DIM_EMBEDDING]), + nn.Dropout(config[varables.RATE_DROPOUT]), + ) + + def forward(self, x): + # = y_input + x = x + self.attn(self.ln1(x)) + x = x + self.mlp(self.ln2(x)) + return x + +class DecoderBlock(nn.Module): + def __init__(self, config): + super().__init__() + self.ln1 = nn.LayerNorm(config[varables.DIM_EMBEDDING]) + self.ln2 = nn.LayerNorm(config[varables.DIM_EMBEDDING]) + self.masked_attn = CausalSelfAttention(config) + self.cross_attn = CrossAttention(config) + self.mlp = nn.Sequential( + nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_FEEDFORWARD]), + nn.GELU(), + nn.Linear(config[varables.DIM_FEEDFORWARD], config[varables.DIM_EMBEDDING]), + nn.Dropout(config[varables.RATE_DROPOUT]), + ) + + def forward(self, x_encoder,x): + # = y_input + x = x + self.masked_attn(self.ln1(x)) + x = x + 
self.cross_attn(x_encoder,self.ln1(x)) + x = x + self.mlp(self.ln2(x)) + return x + + + + + + + + + + + + + + + + + +import torch +import torch.nn as nn +import torch.nn.functional as F +import math + + +class Norm(nn.Module): + def __init__(self, d_model, eps = 1e-6): + super().__init__() + + self.size = d_model + + # create two learnable parameters to calibrate normalisation + self.alpha = nn.Parameter(torch.ones(self.size)) + self.bias = nn.Parameter(torch.zeros(self.size)) + + self.eps = eps + + def forward(self, x): + norm = self.alpha * (x - x.mean(dim=-1, keepdim=True)) \ + / (x.std(dim=-1, keepdim=True) + self.eps) + self.bias + return norm + +def attention(q, k, v, d_k, mask=None, dropout=None): + + scores = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(d_k) + + if mask is not None: + mask = mask.unsqueeze(1) + scores = scores.masked_fill(mask == 0, -1e9) + + scores = F.softmax(scores, dim=-1) + + if dropout is not None: + scores = dropout(scores) + + output = torch.matmul(scores, v) + return output + +class MultiHeadAttention(nn.Module): + def __init__(self, heads, d_model, dropout = 0.1): + super().__init__() + + self.d_model = d_model + self.d_k = d_model // heads + self.h = heads + + self.q_linear = nn.Linear(d_model, d_model) + self.v_linear = nn.Linear(d_model, d_model) + self.k_linear = nn.Linear(d_model, d_model) + + self.dropout = nn.Dropout(dropout) + self.out = nn.Linear(d_model, d_model) + + def forward(self, q, k, v, mask=None): + + bs = q.size(0) + + # perform linear operation and split into N heads + k = self.k_linear(k).view(bs, -1, self.h, self.d_k) + q = self.q_linear(q).view(bs, -1, self.h, self.d_k) + v = self.v_linear(v).view(bs, -1, self.h, self.d_k) + + # transpose to get dimensions bs * N * sl * d_model + k = k.transpose(1,2) + q = q.transpose(1,2) + v = v.transpose(1,2) + + + # calculate attention using function we will define next + scores = attention(q, k, v, self.d_k, mask, self.dropout) + # concatenate heads and put through 
final linear layer + concat = scores.transpose(1,2).contiguous()\ + .view(bs, -1, self.d_model) + output = self.out(concat) + + return output + +class FeedForward(nn.Module): + def __init__(self, d_model, d_ff=2048, dropout = 0.1): + super().__init__() + + # We set d_ff as a default to 2048 + self.linear_1 = nn.Linear(d_model, d_ff) + self.dropout = nn.Dropout(dropout) + self.linear_2 = nn.Linear(d_ff, d_model) + + def forward(self, x): + x = self.dropout(F.relu(self.linear_1(x))) + x = self.linear_2(x) + return x + + + + +import torch +import torch.nn as nn +import copy + + +class EncoderLayer(nn.Module): + def __init__(self, d_model, heads, dropout=0.1): + super().__init__() + self.norm_1 = Norm(d_model) + self.norm_2 = Norm(d_model) + self.attn = MultiHeadAttention(heads, d_model, dropout=dropout) + self.ff = FeedForward(d_model, dropout=dropout) + self.dropout_1 = nn.Dropout(dropout) + self.dropout_2 = nn.Dropout(dropout) + + def forward(self, x, mask): + x2 = self.norm_1(x) + x = x + self.dropout_1(self.attn(x2,x2,x2,mask)) + x2 = self.norm_2(x) + x = x + self.dropout_2(self.ff(x2)) + return x + +# build a decoder layer with two multi-head attention layers and +# one feed-forward layer +class DecoderLayer(nn.Module): + def __init__(self, d_model, heads, dropout=0.1): + super().__init__() + self.norm_1 = Norm(d_model) + self.norm_2 = Norm(d_model) + self.norm_3 = Norm(d_model) + + self.dropout_1 = nn.Dropout(dropout) + self.dropout_2 = nn.Dropout(dropout) + self.dropout_3 = nn.Dropout(dropout) + + self.attn_1 = MultiHeadAttention(heads, d_model, dropout=dropout) + self.attn_2 = MultiHeadAttention(heads, d_model, dropout=dropout) + self.ff = FeedForward(d_model, dropout=dropout) + + def forward(self, x, e_outputs, src_mask, trg_mask): + x2 = self.norm_1(x) + x = x + self.dropout_1(self.attn_1(x2, x2, x2, trg_mask)) + x2 = self.norm_2(x) + x = x + self.dropout_2(self.attn_2(x2, e_outputs, e_outputs, \ + src_mask)) + x2 = self.norm_3(x) + x = x + 
self.dropout_3(self.ff(x2)) + return x + + +import torch +import torch.nn as nn +import math +from torch.autograd import Variable + +class Embedder(nn.Module): + def __init__(self, vocab_size, d_model): + super().__init__() + self.d_model = d_model + self.embed = nn.Embedding(vocab_size, d_model) + def forward(self, x): + return self.embed(x) + +class PositionalEncoder(nn.Module): + def __init__(self, d_model, max_seq_len = 200, dropout = 0.1): + super().__init__() + self.d_model = d_model + self.dropout = nn.Dropout(dropout) + # create constant 'pe' matrix with values dependant on + # pos and i + pe = torch.zeros(max_seq_len, d_model) + for pos in range(max_seq_len): + for i in range(0, d_model, 2): + pe[pos, i] = \ + math.sin(pos / (10000 ** ((2 * i)/d_model))) + pe[pos, i + 1] = \ + math.cos(pos / (10000 ** ((2 * (i + 1))/d_model))) + pe = pe.unsqueeze(0) + self.register_buffer('pe', pe) + + + def forward(self, x): + # make embeddings relatively larger + x = x * math.sqrt(self.d_model) + #add constant to embedding + seq_len = x.size(1) + pe = Variable(self.pe[:,:seq_len], requires_grad=False) + if x.is_cuda: + pe.cuda() + x = x + pe + return self.dropout(x) + +def get_clones(module, N): + return nn.ModuleList([copy.deepcopy(module) for i in range(N)]) + +class Encoder(nn.Module): + def __init__(self, vocab_size, d_model, N, heads, dropout): + super().__init__() + self.N = N + self.embed = Embedder(vocab_size, d_model) + self.pe = PositionalEncoder(d_model, dropout=dropout) + self.layers = get_clones(EncoderLayer(d_model, heads, dropout), N) + self.norm = Norm(d_model) + def forward(self, src, mask): + x = self.embed(src) + x = self.pe(x) + for i in range(self.N): + x = self.layers[i](x, mask) + return self.norm(x) + +class Decoder(nn.Module): + def __init__(self, vocab_size, d_model, N, heads, dropout): + super().__init__() + self.N = N + self.embed = Embedder(vocab_size, d_model) + self.pe = PositionalEncoder(d_model, dropout=dropout) + self.layers = 
get_clones(DecoderLayer(d_model, heads, dropout), N) + self.norm = Norm(d_model) + def forward(self, trg, e_outputs, src_mask, trg_mask): + x = self.embed(trg) + x = self.pe(x) + for i in range(self.N): + x = self.layers[i](x, e_outputs, src_mask, trg_mask) + return self.norm(x) + +class Model(nn.Module): + def __init__(self, config): + super().__init__() + self.encoder = Encoder(len(config["vocab_encoder"]), config[varables.DIM_ATTENTION], config[varables.NUM_LAYERS], config[varables.NUM_HEADS], config[varables.RATE_DROPOUT]) + self.decoder = Decoder(len(config["vocab_decoder"]), config[varables.DIM_ATTENTION], config[varables.NUM_LAYERS], config[varables.NUM_HEADS], config[varables.RATE_DROPOUT]) + self.out = nn.Linear(config[varables.DIM_ATTENTION], len(config["vocab_decoder"])) + # self.tok_emb = nn.Embedding(config[varables.SIZE_VOCAB], config[varables.DIM_EMBEDDING]) + # self.pos_emb = nn.Parameter(torch.zeros(1, config[varables.SIZE_BLOCK], config[varables.DIM_EMBEDDING])) + # self.drop = nn.Dropout(config[varables.RATE_DROPOUT]) + # self.encoder_blocks = nn.ModuleList([EncoderBlock(config) for _ in range(config[varables.NUM_LAYERS])]) + # self.decoder_blocks = nn.ModuleList([DecoderBlock(config) for _ in range(config[varables.NUM_LAYERS])]) + # self.blocks = nn.Sequential(*[DecoderBlock(config) for _ in range(config[varables.NUM_LAYERS])]) + # self.ln_f = nn.LayerNorm(config[varables.DIM_EMBEDDING]) + # self.head = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.SIZE_VOCAB], bias=False) + # self.block_size = config[varables.SIZE_BLOCK] + # self.apply(self._init_weights) + # logger.info("number of parameters: %e", sum(p.numel() for p in self.parameters())) + self.optimizer = None + + def get_block_size(self): + return self.block_size + + def _init_weights(self, module): + if isinstance(module, (nn.Linear, nn.Embedding)): + module.weight.data.normal_(mean=0.0, std=0.02) + if isinstance(module, nn.Linear) and module.bias is not None: + 
module.bias.data.zero_() + elif isinstance(module, nn.LayerNorm): + module.bias.data.zero_() + module.weight.data.fill_(1.0) + def init_optimizers(self,train_config): + optimizer = torch.optim.Adam(self.parameters(), lr=train_config[varables.RATE_LEARNING]) + return optimizer + def init_scheduler(self,train_config): + scheduler = torch.optim.lr_scheduler.StepLR(self.optimizer, step_size=train_config[varables.SIZE_STEP], gamma=train_config[varables.GAMMA]) + return scheduler + def get_collate_fn(self, vocab_encoder,vocab_decoder): + def collate(results): + x_in = [a[0] for a in results] + y_in = [a[1] for a in results] + boundary = -1 + max_len_x = max([len(a) for a in x_in]) + max_len_y = max([len(a) for a in y_in]) + x = torch.tensor([(a+[vocab_encoder[varables.TOKEN_PAD]]*(max_len_x-len(a))) for a in x_in],dtype=torch.long) + y = torch.tensor([(a+[vocab_decoder[varables.TOKEN_PAD]]*(max_len_y-len(a))) for a in y_in],dtype=torch.long) + return x,y,boundary + return collate + def forward(self, src, trg, trg_out, boundary=None): + src_mask = None + trg_mask = torch.tril(torch.ones(trg.shape[1], trg.shape[1])).view(1, 1, trg.shape[1], trg.shape[1]).to(trg.device) + e_outputs = self.encoder(src, src_mask) + d_output = self.decoder(trg, e_outputs, src_mask, trg_mask) + logits = self.out(d_output) + loss = None + if trg_out is not None: + loss = F.cross_entropy(logits.view(-1, logits.size(-1)), trg_out.view(-1)) + return logits, loss + +# mark test \ No newline at end of file diff --git a/SCMG/models/Transformer_debug/model copy.py b/SCMG/models/Transformer_debug/model copy.py new file mode 100644 index 0000000000000000000000000000000000000000..85ed98da342e63696371099158471e07cd1bf25c --- /dev/null +++ b/SCMG/models/Transformer_debug/model copy.py @@ -0,0 +1,187 @@ +import math +import logging + +import torch +import torch.nn as nn +from torch.nn import functional as F + +logger = logging.getLogger(__name__) +from SCMG.config import varables + +# class ModelConfig(): +# 
rate_dropout_embedding = 0.1 +# rate_dropout_residue = 0.1 +# rate_dropout_attention = 0.1 +# block_size=125 +# def __init__(self, size_vocab, **kwargs): +# self.size_vocab = size_vocab +# for k,v in kwargs.items(): +# setattr(self, k, v) + +class CausalSelfAttention(nn.Module): + def __init__(self, config): + super().__init__() + assert config[varables.DIM_ATTENTION] % config[varables.NUM_HEADS] == 0 + self.key = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION]) + self.query = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION]) + self.value = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION]) + self.dropout_attention = nn.Dropout(config[varables.RATE_DROPOUT]) + self.dropout_residue = nn.Dropout(config[varables.RATE_DROPOUT]) + self.projection = nn.Linear(config[varables.DIM_ATTENTION], config[varables.DIM_EMBEDDING]) + self.register_buffer("mask", torch.tril(torch.ones(config[varables.SIZE_BLOCK], config[varables.SIZE_BLOCK])) + .view(1, 1, config[varables.SIZE_BLOCK], config[varables.SIZE_BLOCK])) + self.n_head = config[varables.NUM_HEADS] + self.single_head_dim = config[varables.DIM_ATTENTION] // self.n_head + self.attention_features = config[varables.DIM_ATTENTION] + + def forward(self, x, layer_past=None): + B, T, C = x.size() + k = self.key(x).view(B, T, self.n_head,self.single_head_dim).transpose(1, 2) + q = self.query(x).view(B, T, self.n_head,self.single_head_dim).transpose(1, 2) + v = self.value(x).view(B, T, self.n_head,self.single_head_dim).transpose(1, 2) + att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1))) + att = att.masked_fill(self.mask[:,:,:T,:T] == 0, float('-inf')) + att = F.softmax(att, dim=-1) + att = self.dropout_attention(att) + y = att @ v + y = y.transpose(1, 2).contiguous().view(B, T, self.attention_features) + y = self.dropout_residue(self.projection(y)) + return y + + +class CrossAttention(nn.Module): + def __init__(self, config): + super().__init__() + 
assert config[varables.DIM_ATTENTION] % config[varables.NUM_HEADS] == 0 + self.key = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION]) + self.query = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION]) + self.value = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION]) + self.dropout_attention = nn.Dropout(config[varables.RATE_DROPOUT]) + self.dropout_residue = nn.Dropout(config[varables.RATE_DROPOUT]) + self.projection = nn.Linear(config[varables.DIM_ATTENTION], config[varables.DIM_EMBEDDING]) + self.n_head = config[varables.NUM_HEADS] + self.single_head_dim = config[varables.DIM_ATTENTION] // self.n_head + self.attention_features = config[varables.DIM_ATTENTION] + self.register_buffer("mask", torch.tril(torch.ones(config[varables.SIZE_BLOCK], config[varables.SIZE_BLOCK])) + .view(1, 1, config[varables.SIZE_BLOCK], config[varables.SIZE_BLOCK])) + + def forward(self, x_encoder,x_decoder, layer_past=None): + B_encoder, T_encoder, C_encoder = x_encoder.size() + B_decoder, T_decoder, C_decoder = x_decoder.size() + k = self.key( x_encoder).view(B_encoder, T_encoder, self.n_head,self.single_head_dim).transpose(1, 2) + q = self.query(x_decoder).view(B_encoder, T_decoder, self.n_head,self.single_head_dim).transpose(1, 2) + v = self.value(x_encoder).view(B_encoder, T_encoder, self.n_head,self.single_head_dim).transpose(1, 2) + att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1))) + att = att.masked_fill(self.mask[:,:,:T_decoder,:T_encoder] == 0, float('-inf')) + att = F.softmax(att, dim=-1) + att = self.dropout_attention(att) + y = att @ v + y = y.transpose(1, 2).contiguous().view(B_encoder, T_decoder, self.attention_features) + y = self.dropout_residue(self.projection(y)) + return y + + + + +class EncoderBlock(nn.Module): + def __init__(self, config): + super().__init__() + self.ln1 = nn.LayerNorm(config[varables.DIM_EMBEDDING]) + self.ln2 = nn.LayerNorm(config[varables.DIM_EMBEDDING]) + 
self.attn = CausalSelfAttention(config) + self.mlp = nn.Sequential( + nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_FEEDFORWARD]), + nn.GELU(), + nn.Linear(config[varables.DIM_FEEDFORWARD], config[varables.DIM_EMBEDDING]), + nn.Dropout(config[varables.RATE_DROPOUT]), + ) + + def forward(self, x): + # = y_input + x = x + self.attn(self.ln1(x)) + x = x + self.mlp(self.ln2(x)) + return x + +class DecoderBlock(nn.Module): + def __init__(self, config): + super().__init__() + self.ln1 = nn.LayerNorm(config[varables.DIM_EMBEDDING]) + self.ln2 = nn.LayerNorm(config[varables.DIM_EMBEDDING]) + self.masked_attn = CausalSelfAttention(config) + self.cross_attn = CrossAttention(config) + self.mlp = nn.Sequential( + nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_FEEDFORWARD]), + nn.GELU(), + nn.Linear(config[varables.DIM_FEEDFORWARD], config[varables.DIM_EMBEDDING]), + nn.Dropout(config[varables.RATE_DROPOUT]), + ) + + def forward(self, x_encoder,x): + # = y_input + x = x + self.masked_attn(self.ln1(x)) + x = x + self.cross_attn(x_encoder,self.ln1(x)) + x = x + self.mlp(self.ln2(x)) + return x + +class Model(nn.Module): + def __init__(self, config): + super().__init__() + self.tok_emb = nn.Embedding(config[varables.SIZE_VOCAB], config[varables.DIM_EMBEDDING]) + self.pos_emb = nn.Parameter(torch.zeros(1, config[varables.SIZE_BLOCK], config[varables.DIM_EMBEDDING])) + self.drop = nn.Dropout(config[varables.RATE_DROPOUT]) + self.encoder_blocks = nn.ModuleList([EncoderBlock(config) for _ in range(config[varables.NUM_LAYERS])]) + self.decoder_blocks = nn.ModuleList([DecoderBlock(config) for _ in range(config[varables.NUM_LAYERS])]) + # self.blocks = nn.Sequential(*[DecoderBlock(config) for _ in range(config[varables.NUM_LAYERS])]) + self.ln_f = nn.LayerNorm(config[varables.DIM_EMBEDDING]) + self.head = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.SIZE_VOCAB], bias=False) + self.block_size = config[varables.SIZE_BLOCK] + 
self.apply(self._init_weights) + logger.info("number of parameters: %e", sum(p.numel() for p in self.parameters())) + self.optimizer = None + + def get_block_size(self): + return self.block_size + + def _init_weights(self, module): + if isinstance(module, (nn.Linear, nn.Embedding)): + module.weight.data.normal_(mean=0.0, std=0.02) + if isinstance(module, nn.Linear) and module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.LayerNorm): + module.bias.data.zero_() + module.weight.data.fill_(1.0) + def init_optimizers(self,train_config): + optimizer = torch.optim.Adam(self.parameters(), lr=train_config[varables.RATE_LEARNING]) + return optimizer + def init_scheduler(self,train_config): + scheduler = torch.optim.lr_scheduler.StepLR(self.optimizer, step_size=train_config[varables.SIZE_STEP], gamma=train_config[varables.GAMMA]) + return scheduler + def get_collate_fn(self, vocab): + def collate(results): + x_in = [a[0] for a in results] + y_in = [a[1] for a in results] + boundary = -1 + max_len_x = max([len(a) for a in x_in]) + max_len_y = max([len(a) for a in y_in]) + x = torch.tensor([(a+[vocab[varables.TOKEN_PAD]]*(max_len_x-len(a))) for a in x_in],dtype=torch.long) + y = torch.tensor([(a+[vocab[varables.TOKEN_PAD]]*(max_len_y-len(a))) for a in y_in],dtype=torch.long) + return x,y,boundary + return collate + + def forward(self, x_in, y_in, y_out=None,boundary=None): + x_in = self.drop(self.tok_emb(x_in) + self.pos_emb[:, :x_in.size()[1], :]) + y_in = self.drop(self.tok_emb(y_in) + self.pos_emb[:, :y_in.size()[1], :]) + # + for encoder_block in self.encoder_blocks: + x_in = encoder_block(x_in) + x_in = self.ln_f(x_in) + for decoder_block in self.decoder_blocks: + y_in = decoder_block(x_in,y_in) + y_in = self.ln_f(y_in) + logits = self.head(y_in) + loss = None + if y_out is not None: + loss = F.cross_entropy(logits.view(-1, logits.size(-1)), y_out.view(-1)) + return logits, loss + +# mark test \ No newline at end of file diff --git 
a/SCMG/models/Transformer_debug/model.py b/SCMG/models/Transformer_debug/model.py new file mode 100644 index 0000000000000000000000000000000000000000..b7845521bf9f5f226fc10a47720fef0e9b6d7cd8 --- /dev/null +++ b/SCMG/models/Transformer_debug/model.py @@ -0,0 +1,275 @@ +import math +import logging + +import torch +import torch.nn as nn +from torch.nn import functional as F + +# logger = logging.getLogger(__name__) +from SCMG.config import varables +from torch.autograd import Variable + +# class PositionalEncoder(nn.Module): +# def __init__(self, config): +# super().__init__() +# pe = torch.zeros(config[varables.SIZE_BLOCK], config[varables.DIM_ATTENTION]) +# for pos in range(config[varables.SIZE_BLOCK]): +# for i in range(0, config[varables.DIM_ATTENTION], 2): +# pe[pos, i] = \ +# math.sin(pos / (10000 ** ((2 * i)/config[varables.DIM_ATTENTION]))) +# pe[pos, i + 1] = \ +# math.cos(pos / (10000 ** ((2 * (i + 1))/config[varables.DIM_ATTENTION]))) +# pe = pe.unsqueeze(0) +# self.register_buffer('pe', pe) +# def forward(self, T): +# #add constant to embedding +# x = Variable(self.pe[:,:T], requires_grad=False) +# return x + + + +class PositionalEncoder(nn.Module): + def __init__(self, config): + super(PositionalEncoder, self).__init__() + self.Dropout = nn.Dropout(p=config[varables.RATE_DROPOUT]) + max_len = config[varables.SIZE_BLOCK] + pe = torch.zeros(max_len, config[varables.DIM_ATTENTION]) + position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1) + div_term = torch.exp(torch.arange(0, config[varables.DIM_ATTENTION], 2).float() * (-math.log(10000.0) / config[varables.DIM_ATTENTION])) + pe[:, 0::2] = torch.sin(position * div_term) + pe[:, 1::2] = torch.cos(position * div_term) + pe = pe.unsqueeze(0) + self.register_buffer('pe', pe) + def forward(self, T): + x = self.Dropout(self.pe[:,:T, :]) + return x + + + +class Attention(nn.Module): + def __init__(self, config): + super().__init__() + assert config[varables.DIM_ATTENTION] % config[varables.NUM_HEADS] 
== 0 + self.Key = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION]) + self.Query = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION]) + self.Value = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION]) + self.Dropout_Attention = nn.Dropout(config[varables.RATE_DROPOUT]) + self.Dropout_Residue = nn.Dropout(config[varables.RATE_DROPOUT]) + self.Projection = nn.Linear(config[varables.DIM_ATTENTION], config[varables.DIM_EMBEDDING]) + self.NumberOfHeads = config[varables.NUM_HEADS] + self.DimHead = config[varables.DIM_ATTENTION] // self.NumberOfHeads + self.DimAttention = config[varables.DIM_ATTENTION] + + def forward(self, X_1,X_2, mask=None): + BatchSize, T_Encoder, _ = X_1.size() + BatchSize, T_Decoder, _ = X_2.size() + K = self.Key( X_1).view(BatchSize, T_Encoder, self.NumberOfHeads,self.DimHead).transpose(1, 2) + Q = self.Query(X_2).view(BatchSize, T_Decoder, self.NumberOfHeads,self.DimHead).transpose(1, 2) + V = self.Value(X_1).view(BatchSize, T_Encoder, self.NumberOfHeads,self.DimHead).transpose(1, 2) + # k,q,v dimension: (BatchSize, SequenceSize, NumberOfHeads, HeadDimension) 3,4,5,16 + ScoreAttention = (Q @ K.transpose(-2, -1)) / math.sqrt(self.DimHead) + ScoreAttention = ScoreAttention.masked_fill(mask==0, -1e9) + ScoreAttention = F.softmax(ScoreAttention, dim=-1) + ScoreAttention = self.Dropout_Attention(ScoreAttention) + # k.transpose(-2,-1): 3,4,16,5 + # (q@(k.transpose(-2,-1))): 3,4,5,5 + Z = ScoreAttention @ V + # y dimension: 3,4,5,16 + Z = Z.transpose(1, 2).contiguous().view(BatchSize, T_Decoder, self.DimAttention) + # y dimension: 3,5,64 + Z = self.Dropout_Residue(self.Projection(Z)) + return Z + + + + + + + + + + +class FeedForward(nn.Module): + def __init__(self, config): + super().__init__() + if config[varables.DIM_FEEDFORWARD] == 0: + Dim_FeedForward = config[varables.DIM_ATTENTION] *4 + else: + Dim_FeedForward = config[varables.DIM_FEEDFORWARD] + self.Linear1 = 
nn.Linear(config[varables.DIM_EMBEDDING], Dim_FeedForward) + self.GELU = nn.GELU() + self.Linear2 = nn.Linear(Dim_FeedForward, config[varables.DIM_EMBEDDING]) + self.Dropout = nn.Dropout(config[varables.RATE_DROPOUT]) + + def forward(self,x): + x = self.Linear1(x) + x = self.GELU (x) + x = self.Dropout(x) + x = self.Linear2(x) + return x + + + + +class EncoderBlock(nn.Module): + def __init__(self, config): + super().__init__() + self.LayerNorm1 = nn.LayerNorm(config[varables.DIM_EMBEDDING]) + self.LayerNorm2 = nn.LayerNorm(config[varables.DIM_EMBEDDING]) + self.Dropout1 = nn.Dropout(config[varables.RATE_DROPOUT]) + self.Dropout2 = nn.Dropout(config[varables.RATE_DROPOUT]) + self.Attention = Attention( config) + self.FeedForward = FeedForward(config) + + def forward(self, X_Encoder,Mask_Encoder): + X_Encoder = self.LayerNorm1(X_Encoder + self.Dropout1(self.Attention( X_Encoder, X_Encoder, Mask_Encoder))) + X_Encoder = self.LayerNorm2(X_Encoder + self.Dropout2(self.FeedForward(X_Encoder))) + return X_Encoder + +class DecoderBlock(nn.Module): + def __init__(self, config): + super().__init__() + self.LayerNorm1 = nn.LayerNorm(config[varables.DIM_EMBEDDING]) + self.LayerNorm2 = nn.LayerNorm(config[varables.DIM_EMBEDDING]) + self.LayerNorm3 = nn.LayerNorm(config[varables.DIM_EMBEDDING]) + self.Dropout1 = nn.Dropout(config[varables.RATE_DROPOUT]) + self.Dropout2 = nn.Dropout(config[varables.RATE_DROPOUT]) + self.Dropout3 = nn.Dropout(config[varables.RATE_DROPOUT]) + self.AttentionMasked = Attention( config) + self.AttentionCross = Attention( config) + self.FeedForward = FeedForward(config) + + def forward(self, X_Encoder,X_Decoder,Mask_Cross,Mask_Decoder): + X_Decoder = self.LayerNorm1(X_Decoder + self.Dropout1(self.AttentionMasked(X_Decoder, X_Decoder, Mask_Decoder))) + X_Decoder = self.LayerNorm2(X_Decoder + self.Dropout2(self.AttentionCross (X_Encoder, X_Decoder, Mask_Cross))) + X_Decoder = self.LayerNorm3(X_Decoder + self.Dropout3(self.FeedForward( X_Decoder ))) + 
return X_Decoder + + + + + + + + + + + + + + + + + + +class Model(nn.Module): + def __init__(self, config): + super().__init__() + # Varables + self.Dim_Attention = config[varables.DIM_ATTENTION] + self.Token_Padding_Encoder = config["Token_Padding_Encoder"] + self.Token_Padding_Decoder = config["Token_Padding_Decoder"] + # Embedding and positional encoding layers + self.Embedding_Encoder = nn.Embedding(len(config["vocab_encoder"]), config[varables.DIM_ATTENTION]) + self.Embedding_Decoder = nn.Embedding(len(config["vocab_decoder"]), config[varables.DIM_ATTENTION]) + self.pos_emb = PositionalEncoder(config) + # Dropout and normalization layers + self.Dropout1 = nn.Dropout(config[varables.RATE_DROPOUT]) + self.Dropout2 = nn.Dropout(config[varables.RATE_DROPOUT]) + self.LayerNorm1 = nn.LayerNorm(config[varables.DIM_EMBEDDING]) + self.LayerNorm2 = nn.LayerNorm(config[varables.DIM_EMBEDDING]) + # Transformer layers + self.encoder_blocks = nn.ModuleList([EncoderBlock(config) for _ in range(config[varables.NUM_LAYERS])]) + self.decoder_blocks = nn.ModuleList([DecoderBlock(config) for _ in range(config[varables.NUM_LAYERS])]) + # Output layer + self.head = nn.Linear(config[varables.DIM_ATTENTION], len(config["vocab_decoder"]), bias=False) + # Init + self.apply(self._init_weights) + self.optimizer = None + # logger.info("number of parameters: %e", sum(p.numel() for p in self.parameters())) + + def _init_weights(self, module): + for p in module.parameters(): + if p.dim() > 1: + nn.init.xavier_uniform_(p) + # if isinstance(module, (nn.Linear, nn.Embedding)): + # module.weight.data.normal_(mean=0.0, std=0.02) + # if isinstance(module, nn.Linear) and module.bias is not None: + # module.bias.data.zero_() + # elif isinstance(module, nn.LayerNorm): + # module.bias.data.zero_() + # module.weight.data.fill_(1.0) + def init_optimizers(self,train_config): + optimizer = torch.optim.Adam(self.parameters(), lr=train_config[varables.RATE_LEARNING]) + return optimizer + def 
init_scheduler(self,train_config): + scheduler = torch.optim.lr_scheduler.StepLR(self.optimizer, step_size=train_config[varables.SIZE_STEP], gamma=train_config[varables.GAMMA]) + return scheduler + def get_collate_fn(self, vocab_encoder,vocab_decoder): + def collate(results): + X_Encoder = [a[0] for a in results] + X_Decoder = [a[1] for a in results] + boundary = -1 + max_len_x = max([len(a) for a in X_Encoder]) + max_len_y = max([len(a) for a in X_Decoder]) + x = torch.tensor([(a+[vocab_encoder[varables.TOKEN_PAD]]*(max_len_x-len(a))) for a in X_Encoder],dtype=torch.long) + y = torch.tensor([(a+[vocab_decoder[varables.TOKEN_PAD]]*(max_len_y-len(a))) for a in X_Decoder],dtype=torch.long) + return x,y,boundary + return collate + + + def generate_masks(self,X_Encoder, X_Decoder): + # Generate encoder, decoder, cross masks + T = X_Decoder.shape[1] + Mask_Encoder = (X_Encoder != self.Token_Padding_Encoder).unsqueeze(-2).unsqueeze(-2) + Mask_Decoder = (X_Decoder != self.Token_Padding_Decoder).unsqueeze(-2).unsqueeze(-2).repeat(1,1,T,1) + Mask_Cross = (X_Encoder != self.Token_Padding_Encoder).unsqueeze(-2).unsqueeze(-2) + mask_tril = torch.tril(torch.ones(T, T)).view(1, 1, T, T).to(Mask_Decoder.device) + Mask_Decoder = Mask_Decoder.masked_fill(mask_tril==0,0) + return Mask_Encoder,Mask_Decoder,Mask_Cross + + def forward(self, X_Encoder, X_Decoder, Y_Decoder_Ref=None,boundary=None): + Mask_Encoder, Mask_Decoder,Mask_Cross = self.generate_masks(X_Encoder, X_Decoder) + # preprocess + X_Encoder = self.Dropout1(self.Embedding_Encoder(X_Encoder) * math.sqrt(self.Dim_Attention) + self.pos_emb(X_Encoder.size(1))) + X_Decoder = self.Dropout2(self.Embedding_Decoder(X_Decoder) * math.sqrt(self.Dim_Attention) + self.pos_emb(X_Decoder.size(1))) + #### Now X_Encoder: BatchSize, SequenceLength, DimAttention + # Encoder blocks + X_Encoder = self.LayerNorm1(X_Encoder) + for encoder_block in self.encoder_blocks: + X_Encoder = encoder_block(X_Encoder,Mask_Encoder) + # Decoder blocks + 
X_Decoder = self.LayerNorm2(X_Decoder) + for decoder_block in self.decoder_blocks: + X_Decoder = decoder_block(X_Encoder,X_Decoder,Mask_Cross,Mask_Decoder) + Y_Decoder_Logits = self.head(X_Decoder) + loss = None + if Y_Decoder_Ref is not None: + loss = F.cross_entropy(Y_Decoder_Logits.view(-1, Y_Decoder_Logits.size(-1)), Y_Decoder_Ref.view(-1),ignore_index=self.Token_Padding_Decoder) + return Y_Decoder_Logits, loss + + + + + + + + + + + + + # def generate_masks(self,X_Encoder, X_Decoder): + # # Generate encoder, decoder, cross masks + # Mask_Encoder = (X_Encoder != self.Token_Padding_Encoder).unsqueeze(-2).int().cpu() + # Mask_Decoder = (X_Decoder != self.Token_Padding_Decoder).unsqueeze(-2).int().cpu() + # Mask_Cross = Mask_Decoder.unsqueeze(-1) @ Mask_Encoder.unsqueeze(-2) + # Mask_Encoder = Mask_Encoder.unsqueeze(-1) @ Mask_Encoder.unsqueeze(-2) + # Mask_Decoder = Mask_Decoder.unsqueeze(-1) @ Mask_Decoder.unsqueeze(-2) + # T = X_Decoder.shape[1] + # mask_tril = torch.tril(torch.ones(T, T)).view(1, 1, T, T) + # Mask_Decoder = Mask_Decoder.masked_fill(mask_tril==0,0) + # Mask_Encoder = Mask_Encoder.to(X_Encoder.device) + # Mask_Decoder = Mask_Decoder.to(X_Decoder.device) + # Mask_Cross = Mask_Cross.to(X_Encoder.device) + # return Mask_Encoder,Mask_Decoder,Mask_Cross diff --git a/SCMG/models/Transformer_debug/sampler.py b/SCMG/models/Transformer_debug/sampler.py new file mode 100644 index 0000000000000000000000000000000000000000..1c606302447534d425b50bbd15c153ea79895b65 --- /dev/null +++ b/SCMG/models/Transformer_debug/sampler.py @@ -0,0 +1,85 @@ +import random +import numpy as np +import torch +import torch.nn as nn +from torch.nn import functional as F + +def set_seed(seed): + random.seed(seed) + np.random.seed(seed) + torch.manual_seed(seed) + torch.cuda.manual_seed_all(seed) + +def top_k_logits(logits, k): + v, ix = torch.topk(logits, k) + out = logits.clone() + out[out < v[:, [-1]]] = -float('Inf') + return out + +@torch.no_grad() +def sample(model, x, steps, 
temperature=1.0, sample=False, top_k=None): + block_size = model.get_block_size() + model.eval() + for k in range(steps): + x_cond = x if x.size(1) <= block_size else x[:, -block_size:] + logits, _ = model(x_cond) + logits = logits[:, -1, :] / temperature + if top_k is not None: + logits = top_k_logits(logits, top_k) + probs = F.softmax(logits, dim=-1) + if sample: + ix = torch.multinomial(probs, num_samples=1) + else: + _, ix = torch.topk(probs, k=1, dim=-1) + x = torch.cat((x, ix), dim=1) + + return x + + + + +@torch.no_grad() +def sample(model, x, steps, temperature=1.0,boundary=None): + block_size = model.get_block_size() + model.eval() + for k in range(steps): + x_cond = x if x.size(1) <= block_size else x[:, -block_size:] + logits, _ = model(x_cond,boundary=boundary) + logits = logits[:, -1, :] / temperature + probs = F.softmax(logits, dim=-1) + ix = torch.multinomial(probs, num_samples=1) + x = torch.cat((x, ix), dim=1) + return x + +'L_5*C(=O)NCc1cccc(OC)c1.*c1nsc2ccccc12COc1cccc(CNC(=O)c2cccc(NC(=O)c3nsc4ccccc34)c2)c1' + +# for i in range(1,21): +def sample_L(i,option='string'): + # i=2 + prefix = 'L_'+str(i) + string_input = prefix + '*O=C1NN=Cc2c1cccc2.*O=C(C1CC1)N1CCNCC1' + array_input = [vocab[a] for a in [''] + list(string_input)] + boundary = [len(array_input)] + tensor_input = torch.tensor(array_input,device='cuda').unsqueeze(0).repeat(32,1) + boundary = boundary*32 + tensor_output = sample(model,tensor_input,250,boundary=boundary) + strings_output = [] + for j in range(tensor_output.shape[0]): + list_string_output = [inv[a] for a in tensor_output[j,boundary[j]:].cpu().numpy() if a != vocab['']] + # if list_string_output[0] == '': + # list_string_output = list_string_output[1:] + if list_string_output[-1] == '': + list_string_output = list_string_output[:-1] + string_output = ''.join(list_string_output) + strings_output.append(string_output) + print(string_output) + for j in range(tensor_output.shape[0]): + if test_valid(strings_output[j]): + 
print(1) + else: + print(0) + + # logits,_ = model(tensor_input,boundary=boundary) + + +['', 'L', '_', '5', '*', 'C', '(', '=', 'O', ')', 'N', 'C', 'c', '1', 'c', 'c', 'c', 'c', '(', 'O', 'C', ')', 'c', '1', '.', '*', 'c', '1', 'n', 's', 'c', '2', 'c', 'c', 'c', 'c', 'c', '1', '2', 'C', 'O', 'c', '1', 'c', 'c', 'c', 'c', '(', 'C', 'N', 'C', '(', '=', 'O', ')', 'c', '2', 'c', 'c', 'c', 'c', '(', 'N', 'C', '(', '=', 'O', ')', 'c', '3', 'n', 's', 'c', '4', 'c', 'c', 'c', 'c', 'c', '3', '4', ')', 'c', '2', ')', 'c', '1', ''] diff --git a/SCMG/models/Transformer_debug2 copy/__init__.py b/SCMG/models/Transformer_debug2 copy/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/SCMG/models/Transformer_debug2 copy/__pycache__/__init__.cpython-310.pyc b/SCMG/models/Transformer_debug2 copy/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a26e17bc9adfde2f235fb28f005491923c2839ed Binary files /dev/null and b/SCMG/models/Transformer_debug2 copy/__pycache__/__init__.cpython-310.pyc differ diff --git a/SCMG/models/Transformer_debug2 copy/__pycache__/model copy 2.cpython-310.pyc b/SCMG/models/Transformer_debug2 copy/__pycache__/model copy 2.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..85ce1889aed10a33dcfafb5baac060711904bb26 Binary files /dev/null and b/SCMG/models/Transformer_debug2 copy/__pycache__/model copy 2.cpython-310.pyc differ diff --git a/SCMG/models/Transformer_debug2 copy/__pycache__/model copy.cpython-310.pyc b/SCMG/models/Transformer_debug2 copy/__pycache__/model copy.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f368fbe64e9c3d744c1185c7bd01619da5448da0 Binary files /dev/null and b/SCMG/models/Transformer_debug2 copy/__pycache__/model copy.cpython-310.pyc differ diff --git a/SCMG/models/Transformer_debug2 copy/__pycache__/model.cpython-310.pyc 
b/SCMG/models/Transformer_debug2 copy/__pycache__/model.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ce98387b4d198842c3e809383a268f4ab9e99b58 Binary files /dev/null and b/SCMG/models/Transformer_debug2 copy/__pycache__/model.cpython-310.pyc differ diff --git a/SCMG/models/Transformer_debug2 copy/__pycache__/sampler.cpython-310.pyc b/SCMG/models/Transformer_debug2 copy/__pycache__/sampler.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..64918d3bb932c62cfeb87135b069381c5fac1f5e Binary files /dev/null and b/SCMG/models/Transformer_debug2 copy/__pycache__/sampler.cpython-310.pyc differ diff --git a/SCMG/models/Transformer_debug2 copy/model copy 2.py b/SCMG/models/Transformer_debug2 copy/model copy 2.py new file mode 100644 index 0000000000000000000000000000000000000000..8b254fbe02eafd3f7370cfae17eb6729563aa260 --- /dev/null +++ b/SCMG/models/Transformer_debug2 copy/model copy 2.py @@ -0,0 +1,420 @@ +import math +import logging + +import torch +import torch.nn as nn +from torch.nn import functional as F + +logger = logging.getLogger(__name__) +from SCMG.config import varables + +# class ModelConfig(): +# rate_dropout_embedding = 0.1 +# rate_dropout_residue = 0.1 +# rate_dropout_attention = 0.1 +# block_size=125 +# def __init__(self, size_vocab, **kwargs): +# self.size_vocab = size_vocab +# for k,v in kwargs.items(): +# setattr(self, k, v) + +class CausalSelfAttention(nn.Module): + def __init__(self, config): + super().__init__() + assert config[varables.DIM_ATTENTION] % config[varables.NUM_HEADS] == 0 + self.key = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION]) + self.query = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION]) + self.value = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION]) + self.dropout_attention = nn.Dropout(config[varables.RATE_DROPOUT]) + self.dropout_residue = nn.Dropout(config[varables.RATE_DROPOUT]) 
+ self.projection = nn.Linear(config[varables.DIM_ATTENTION], config[varables.DIM_EMBEDDING]) + self.register_buffer("mask", torch.tril(torch.ones(config[varables.SIZE_BLOCK], config[varables.SIZE_BLOCK])) + .view(1, 1, config[varables.SIZE_BLOCK], config[varables.SIZE_BLOCK])) + self.n_head = config[varables.NUM_HEADS] + self.single_head_dim = config[varables.DIM_ATTENTION] // self.n_head + self.attention_features = config[varables.DIM_ATTENTION] + + def forward(self, x, layer_past=None): + B, T, C = x.size() + k = self.key(x).view(B, T, self.n_head,self.single_head_dim).transpose(1, 2) + q = self.query(x).view(B, T, self.n_head,self.single_head_dim).transpose(1, 2) + v = self.value(x).view(B, T, self.n_head,self.single_head_dim).transpose(1, 2) + att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1))) + att = att.masked_fill(self.mask[:,:,:T,:T] == 0, float('-inf')) + att = F.softmax(att, dim=-1) + att = self.dropout_attention(att) + y = att @ v + y = y.transpose(1, 2).contiguous().view(B, T, self.attention_features) + y = self.dropout_residue(self.projection(y)) + return y + + +class CrossAttention(nn.Module): + def __init__(self, config): + super().__init__() + assert config[varables.DIM_ATTENTION] % config[varables.NUM_HEADS] == 0 + self.key = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION]) + self.query = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION]) + self.value = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION]) + self.dropout_attention = nn.Dropout(config[varables.RATE_DROPOUT]) + self.dropout_residue = nn.Dropout(config[varables.RATE_DROPOUT]) + self.projection = nn.Linear(config[varables.DIM_ATTENTION], config[varables.DIM_EMBEDDING]) + self.n_head = config[varables.NUM_HEADS] + self.single_head_dim = config[varables.DIM_ATTENTION] // self.n_head + self.attention_features = config[varables.DIM_ATTENTION] + self.register_buffer("mask", 
torch.tril(torch.ones(config[varables.SIZE_BLOCK], config[varables.SIZE_BLOCK])) + .view(1, 1, config[varables.SIZE_BLOCK], config[varables.SIZE_BLOCK])) + + def forward(self, x_encoder,x_decoder, layer_past=None): + B_encoder, T_encoder, C_encoder = x_encoder.size() + B_decoder, T_decoder, C_decoder = x_decoder.size() + k = self.key( x_encoder).view(B_encoder, T_encoder, self.n_head,self.single_head_dim).transpose(1, 2) + q = self.query(x_decoder).view(B_encoder, T_decoder, self.n_head,self.single_head_dim).transpose(1, 2) + v = self.value(x_encoder).view(B_encoder, T_encoder, self.n_head,self.single_head_dim).transpose(1, 2) + att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1))) + att = att.masked_fill(self.mask[:,:,:T_decoder,:T_encoder] == 0, float('-inf')) + att = F.softmax(att, dim=-1) + att = self.dropout_attention(att) + y = att @ v + y = y.transpose(1, 2).contiguous().view(B_encoder, T_decoder, self.attention_features) + y = self.dropout_residue(self.projection(y)) + return y + + + + +class EncoderBlock(nn.Module): + def __init__(self, config): + super().__init__() + self.ln1 = nn.LayerNorm(config[varables.DIM_EMBEDDING]) + self.ln2 = nn.LayerNorm(config[varables.DIM_EMBEDDING]) + self.attn = CausalSelfAttention(config) + self.mlp = nn.Sequential( + nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_FEEDFORWARD]), + nn.GELU(), + nn.Linear(config[varables.DIM_FEEDFORWARD], config[varables.DIM_EMBEDDING]), + nn.Dropout(config[varables.RATE_DROPOUT]), + ) + + def forward(self, x): + # = y_input + x = x + self.attn(self.ln1(x)) + x = x + self.mlp(self.ln2(x)) + return x + +class DecoderBlock(nn.Module): + def __init__(self, config): + super().__init__() + self.ln1 = nn.LayerNorm(config[varables.DIM_EMBEDDING]) + self.ln2 = nn.LayerNorm(config[varables.DIM_EMBEDDING]) + self.masked_attn = CausalSelfAttention(config) + self.cross_attn = CrossAttention(config) + self.mlp = nn.Sequential( + nn.Linear(config[varables.DIM_EMBEDDING], 
config[varables.DIM_FEEDFORWARD]), + nn.GELU(), + nn.Linear(config[varables.DIM_FEEDFORWARD], config[varables.DIM_EMBEDDING]), + nn.Dropout(config[varables.RATE_DROPOUT]), + ) + + def forward(self, x_encoder,x): + # = y_input + x = x + self.masked_attn(self.ln1(x)) + x = x + self.cross_attn(x_encoder,self.ln1(x)) + x = x + self.mlp(self.ln2(x)) + return x + + + + + + + + + + + + + + + + + +import torch +import torch.nn as nn +import torch.nn.functional as F +import math + + +class Norm(nn.Module): + def __init__(self, d_model, eps = 1e-6): + super().__init__() + + self.size = d_model + + # create two learnable parameters to calibrate normalisation + self.alpha = nn.Parameter(torch.ones(self.size)) + self.bias = nn.Parameter(torch.zeros(self.size)) + + self.eps = eps + + def forward(self, x): + norm = self.alpha * (x - x.mean(dim=-1, keepdim=True)) \ + / (x.std(dim=-1, keepdim=True) + self.eps) + self.bias + return norm + +def attention(q, k, v, d_k, mask=None, dropout=None): + + scores = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(d_k) + + if mask is not None: + mask = mask.unsqueeze(1) + scores = scores.masked_fill(mask == 0, -1e9) + + scores = F.softmax(scores, dim=-1) + + if dropout is not None: + scores = dropout(scores) + + output = torch.matmul(scores, v) + return output + +class MultiHeadAttention(nn.Module): + def __init__(self, heads, d_model, dropout = 0.1): + super().__init__() + + self.d_model = d_model + self.d_k = d_model // heads + self.h = heads + + self.q_linear = nn.Linear(d_model, d_model) + self.v_linear = nn.Linear(d_model, d_model) + self.k_linear = nn.Linear(d_model, d_model) + + self.dropout = nn.Dropout(dropout) + self.out = nn.Linear(d_model, d_model) + + def forward(self, q, k, v, mask=None): + + bs = q.size(0) + + # perform linear operation and split into N heads + k = self.k_linear(k).view(bs, -1, self.h, self.d_k) + q = self.q_linear(q).view(bs, -1, self.h, self.d_k) + v = self.v_linear(v).view(bs, -1, self.h, self.d_k) + + # 
transpose to get dimensions bs * N * sl * d_model + k = k.transpose(1,2) + q = q.transpose(1,2) + v = v.transpose(1,2) + + + # calculate attention using function we will define next + scores = attention(q, k, v, self.d_k, mask, self.dropout) + # concatenate heads and put through final linear layer + concat = scores.transpose(1,2).contiguous()\ + .view(bs, -1, self.d_model) + output = self.out(concat) + + return output + +class FeedForward(nn.Module): + def __init__(self, d_model, d_ff=2048, dropout = 0.1): + super().__init__() + + # We set d_ff as a default to 2048 + self.linear_1 = nn.Linear(d_model, d_ff) + self.dropout = nn.Dropout(dropout) + self.linear_2 = nn.Linear(d_ff, d_model) + + def forward(self, x): + x = self.dropout(F.relu(self.linear_1(x))) + x = self.linear_2(x) + return x + + + + +import torch +import torch.nn as nn +import copy + + +class EncoderLayer(nn.Module): + def __init__(self, d_model, heads, dropout=0.1): + super().__init__() + self.norm_1 = Norm(d_model) + self.norm_2 = Norm(d_model) + self.attn = MultiHeadAttention(heads, d_model, dropout=dropout) + self.ff = FeedForward(d_model, dropout=dropout) + self.dropout_1 = nn.Dropout(dropout) + self.dropout_2 = nn.Dropout(dropout) + + def forward(self, x, mask): + x2 = self.norm_1(x) + x = x + self.dropout_1(self.attn(x2,x2,x2,mask)) + x2 = self.norm_2(x) + x = x + self.dropout_2(self.ff(x2)) + return x + +# build a decoder layer with two multi-head attention layers and +# one feed-forward layer +class DecoderLayer(nn.Module): + def __init__(self, d_model, heads, dropout=0.1): + super().__init__() + self.norm_1 = Norm(d_model) + self.norm_2 = Norm(d_model) + self.norm_3 = Norm(d_model) + + self.dropout_1 = nn.Dropout(dropout) + self.dropout_2 = nn.Dropout(dropout) + self.dropout_3 = nn.Dropout(dropout) + + self.attn_1 = MultiHeadAttention(heads, d_model, dropout=dropout) + self.attn_2 = MultiHeadAttention(heads, d_model, dropout=dropout) + self.ff = FeedForward(d_model, dropout=dropout) + + def 
forward(self, x, e_outputs, src_mask, trg_mask): + x2 = self.norm_1(x) + x = x + self.dropout_1(self.attn_1(x2, x2, x2, trg_mask)) + x2 = self.norm_2(x) + x = x + self.dropout_2(self.attn_2(x2, e_outputs, e_outputs, \ + src_mask)) + x2 = self.norm_3(x) + x = x + self.dropout_3(self.ff(x2)) + return x + + +import torch +import torch.nn as nn +import math +from torch.autograd import Variable + +class Embedder(nn.Module): + def __init__(self, vocab_size, d_model): + super().__init__() + self.d_model = d_model + self.embed = nn.Embedding(vocab_size, d_model) + def forward(self, x): + return self.embed(x) + +class PositionalEncoder(nn.Module): + def __init__(self, d_model, max_seq_len = 200, dropout = 0.1): + super().__init__() + self.d_model = d_model + self.dropout = nn.Dropout(dropout) + # create constant 'pe' matrix with values dependant on + # pos and i + pe = torch.zeros(max_seq_len, d_model) + for pos in range(max_seq_len): + for i in range(0, d_model, 2): + pe[pos, i] = \ + math.sin(pos / (10000 ** ((2 * i)/d_model))) + pe[pos, i + 1] = \ + math.cos(pos / (10000 ** ((2 * (i + 1))/d_model))) + pe = pe.unsqueeze(0) + self.register_buffer('pe', pe) + + + def forward(self, x): + # make embeddings relatively larger + x = x * math.sqrt(self.d_model) + #add constant to embedding + seq_len = x.size(1) + pe = Variable(self.pe[:,:seq_len], requires_grad=False) + if x.is_cuda: + pe.cuda() + x = x + pe + return self.dropout(x) + +def get_clones(module, N): + return nn.ModuleList([copy.deepcopy(module) for i in range(N)]) + +class Encoder(nn.Module): + def __init__(self, vocab_size, d_model, N, heads, dropout): + super().__init__() + self.N = N + self.embed = Embedder(vocab_size, d_model) + self.pe = PositionalEncoder(d_model, dropout=dropout) + self.layers = get_clones(EncoderLayer(d_model, heads, dropout), N) + self.norm = Norm(d_model) + def forward(self, src, mask): + x = self.embed(src) + x = self.pe(x) + for i in range(self.N): + x = self.layers[i](x, mask) + return 
self.norm(x) + +class Decoder(nn.Module): + def __init__(self, vocab_size, d_model, N, heads, dropout): + super().__init__() + self.N = N + self.embed = Embedder(vocab_size, d_model) + self.pe = PositionalEncoder(d_model, dropout=dropout) + self.layers = get_clones(DecoderLayer(d_model, heads, dropout), N) + self.norm = Norm(d_model) + def forward(self, trg, e_outputs, src_mask, trg_mask): + x = self.embed(trg) + x = self.pe(x) + for i in range(self.N): + x = self.layers[i](x, e_outputs, src_mask, trg_mask) + return self.norm(x) + +class Model(nn.Module): + def __init__(self, config): + super().__init__() + self.encoder = Encoder(len(config["vocab_encoder"]), config[varables.DIM_ATTENTION], config[varables.NUM_LAYERS], config[varables.NUM_HEADS], config[varables.RATE_DROPOUT]) + self.decoder = Decoder(len(config["vocab_decoder"]), config[varables.DIM_ATTENTION], config[varables.NUM_LAYERS], config[varables.NUM_HEADS], config[varables.RATE_DROPOUT]) + self.out = nn.Linear(config[varables.DIM_ATTENTION], len(config["vocab_decoder"])) + # self.tok_emb = nn.Embedding(config[varables.SIZE_VOCAB], config[varables.DIM_EMBEDDING]) + # self.pos_emb = nn.Parameter(torch.zeros(1, config[varables.SIZE_BLOCK], config[varables.DIM_EMBEDDING])) + # self.drop = nn.Dropout(config[varables.RATE_DROPOUT]) + # self.encoder_blocks = nn.ModuleList([EncoderBlock(config) for _ in range(config[varables.NUM_LAYERS])]) + # self.decoder_blocks = nn.ModuleList([DecoderBlock(config) for _ in range(config[varables.NUM_LAYERS])]) + # self.blocks = nn.Sequential(*[DecoderBlock(config) for _ in range(config[varables.NUM_LAYERS])]) + # self.ln_f = nn.LayerNorm(config[varables.DIM_EMBEDDING]) + # self.head = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.SIZE_VOCAB], bias=False) + # self.block_size = config[varables.SIZE_BLOCK] + # self.apply(self._init_weights) + # logger.info("number of parameters: %e", sum(p.numel() for p in self.parameters())) + self.optimizer = None + + def 
get_block_size(self): + return self.block_size + + def _init_weights(self, module): + if isinstance(module, (nn.Linear, nn.Embedding)): + module.weight.data.normal_(mean=0.0, std=0.02) + if isinstance(module, nn.Linear) and module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.LayerNorm): + module.bias.data.zero_() + module.weight.data.fill_(1.0) + def init_optimizers(self,train_config): + optimizer = torch.optim.Adam(self.parameters(), lr=train_config[varables.RATE_LEARNING]) + return optimizer + def init_scheduler(self,train_config): + scheduler = torch.optim.lr_scheduler.StepLR(self.optimizer, step_size=train_config[varables.SIZE_STEP], gamma=train_config[varables.GAMMA]) + return scheduler + def get_collate_fn(self, vocab_encoder,vocab_decoder): + def collate(results): + x_in = [a[0] for a in results] + y_in = [a[1] for a in results] + boundary = -1 + max_len_x = max([len(a) for a in x_in]) + max_len_y = max([len(a) for a in y_in]) + x = torch.tensor([(a+[vocab_encoder[varables.TOKEN_PAD]]*(max_len_x-len(a))) for a in x_in],dtype=torch.long) + y = torch.tensor([(a+[vocab_decoder[varables.TOKEN_PAD]]*(max_len_y-len(a))) for a in y_in],dtype=torch.long) + return x,y,boundary + return collate + def forward(self, src, trg, trg_out, boundary=None): + src_mask = None + trg_mask = torch.tril(torch.ones(trg.shape[1], trg.shape[1])).view(1, 1, trg.shape[1], trg.shape[1]).to(trg.device) + e_outputs = self.encoder(src, src_mask) + d_output = self.decoder(trg, e_outputs, src_mask, trg_mask) + logits = self.out(d_output) + loss = None + if trg_out is not None: + loss = F.cross_entropy(logits.view(-1, logits.size(-1)), trg_out.view(-1)) + return logits, loss + +# mark test \ No newline at end of file diff --git a/SCMG/models/Transformer_debug2 copy/model copy.py b/SCMG/models/Transformer_debug2 copy/model copy.py new file mode 100644 index 0000000000000000000000000000000000000000..85ed98da342e63696371099158471e07cd1bf25c --- /dev/null +++ 
b/SCMG/models/Transformer_debug2 copy/model copy.py @@ -0,0 +1,187 @@ +import math +import logging + +import torch +import torch.nn as nn +from torch.nn import functional as F + +logger = logging.getLogger(__name__) +from SCMG.config import varables + +# class ModelConfig(): +# rate_dropout_embedding = 0.1 +# rate_dropout_residue = 0.1 +# rate_dropout_attention = 0.1 +# block_size=125 +# def __init__(self, size_vocab, **kwargs): +# self.size_vocab = size_vocab +# for k,v in kwargs.items(): +# setattr(self, k, v) + +class CausalSelfAttention(nn.Module): + def __init__(self, config): + super().__init__() + assert config[varables.DIM_ATTENTION] % config[varables.NUM_HEADS] == 0 + self.key = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION]) + self.query = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION]) + self.value = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION]) + self.dropout_attention = nn.Dropout(config[varables.RATE_DROPOUT]) + self.dropout_residue = nn.Dropout(config[varables.RATE_DROPOUT]) + self.projection = nn.Linear(config[varables.DIM_ATTENTION], config[varables.DIM_EMBEDDING]) + self.register_buffer("mask", torch.tril(torch.ones(config[varables.SIZE_BLOCK], config[varables.SIZE_BLOCK])) + .view(1, 1, config[varables.SIZE_BLOCK], config[varables.SIZE_BLOCK])) + self.n_head = config[varables.NUM_HEADS] + self.single_head_dim = config[varables.DIM_ATTENTION] // self.n_head + self.attention_features = config[varables.DIM_ATTENTION] + + def forward(self, x, layer_past=None): + B, T, C = x.size() + k = self.key(x).view(B, T, self.n_head,self.single_head_dim).transpose(1, 2) + q = self.query(x).view(B, T, self.n_head,self.single_head_dim).transpose(1, 2) + v = self.value(x).view(B, T, self.n_head,self.single_head_dim).transpose(1, 2) + att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1))) + att = att.masked_fill(self.mask[:,:,:T,:T] == 0, float('-inf')) + att = F.softmax(att, 
dim=-1) + att = self.dropout_attention(att) + y = att @ v + y = y.transpose(1, 2).contiguous().view(B, T, self.attention_features) + y = self.dropout_residue(self.projection(y)) + return y + + +class CrossAttention(nn.Module): + def __init__(self, config): + super().__init__() + assert config[varables.DIM_ATTENTION] % config[varables.NUM_HEADS] == 0 + self.key = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION]) + self.query = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION]) + self.value = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION]) + self.dropout_attention = nn.Dropout(config[varables.RATE_DROPOUT]) + self.dropout_residue = nn.Dropout(config[varables.RATE_DROPOUT]) + self.projection = nn.Linear(config[varables.DIM_ATTENTION], config[varables.DIM_EMBEDDING]) + self.n_head = config[varables.NUM_HEADS] + self.single_head_dim = config[varables.DIM_ATTENTION] // self.n_head + self.attention_features = config[varables.DIM_ATTENTION] + self.register_buffer("mask", torch.tril(torch.ones(config[varables.SIZE_BLOCK], config[varables.SIZE_BLOCK])) + .view(1, 1, config[varables.SIZE_BLOCK], config[varables.SIZE_BLOCK])) + + def forward(self, x_encoder,x_decoder, layer_past=None): + B_encoder, T_encoder, C_encoder = x_encoder.size() + B_decoder, T_decoder, C_decoder = x_decoder.size() + k = self.key( x_encoder).view(B_encoder, T_encoder, self.n_head,self.single_head_dim).transpose(1, 2) + q = self.query(x_decoder).view(B_encoder, T_decoder, self.n_head,self.single_head_dim).transpose(1, 2) + v = self.value(x_encoder).view(B_encoder, T_encoder, self.n_head,self.single_head_dim).transpose(1, 2) + att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1))) + att = att.masked_fill(self.mask[:,:,:T_decoder,:T_encoder] == 0, float('-inf')) + att = F.softmax(att, dim=-1) + att = self.dropout_attention(att) + y = att @ v + y = y.transpose(1, 2).contiguous().view(B_encoder, T_decoder, 
self.attention_features) + y = self.dropout_residue(self.projection(y)) + return y + + + + +class EncoderBlock(nn.Module): + def __init__(self, config): + super().__init__() + self.ln1 = nn.LayerNorm(config[varables.DIM_EMBEDDING]) + self.ln2 = nn.LayerNorm(config[varables.DIM_EMBEDDING]) + self.attn = CausalSelfAttention(config) + self.mlp = nn.Sequential( + nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_FEEDFORWARD]), + nn.GELU(), + nn.Linear(config[varables.DIM_FEEDFORWARD], config[varables.DIM_EMBEDDING]), + nn.Dropout(config[varables.RATE_DROPOUT]), + ) + + def forward(self, x): + # = y_input + x = x + self.attn(self.ln1(x)) + x = x + self.mlp(self.ln2(x)) + return x + +class DecoderBlock(nn.Module): + def __init__(self, config): + super().__init__() + self.ln1 = nn.LayerNorm(config[varables.DIM_EMBEDDING]) + self.ln2 = nn.LayerNorm(config[varables.DIM_EMBEDDING]) + self.masked_attn = CausalSelfAttention(config) + self.cross_attn = CrossAttention(config) + self.mlp = nn.Sequential( + nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_FEEDFORWARD]), + nn.GELU(), + nn.Linear(config[varables.DIM_FEEDFORWARD], config[varables.DIM_EMBEDDING]), + nn.Dropout(config[varables.RATE_DROPOUT]), + ) + + def forward(self, x_encoder,x): + # = y_input + x = x + self.masked_attn(self.ln1(x)) + x = x + self.cross_attn(x_encoder,self.ln1(x)) + x = x + self.mlp(self.ln2(x)) + return x + +class Model(nn.Module): + def __init__(self, config): + super().__init__() + self.tok_emb = nn.Embedding(config[varables.SIZE_VOCAB], config[varables.DIM_EMBEDDING]) + self.pos_emb = nn.Parameter(torch.zeros(1, config[varables.SIZE_BLOCK], config[varables.DIM_EMBEDDING])) + self.drop = nn.Dropout(config[varables.RATE_DROPOUT]) + self.encoder_blocks = nn.ModuleList([EncoderBlock(config) for _ in range(config[varables.NUM_LAYERS])]) + self.decoder_blocks = nn.ModuleList([DecoderBlock(config) for _ in range(config[varables.NUM_LAYERS])]) + # self.blocks = 
nn.Sequential(*[DecoderBlock(config) for _ in range(config[varables.NUM_LAYERS])]) + self.ln_f = nn.LayerNorm(config[varables.DIM_EMBEDDING]) + self.head = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.SIZE_VOCAB], bias=False) + self.block_size = config[varables.SIZE_BLOCK] + self.apply(self._init_weights) + logger.info("number of parameters: %e", sum(p.numel() for p in self.parameters())) + self.optimizer = None + + def get_block_size(self): + return self.block_size + + def _init_weights(self, module): + if isinstance(module, (nn.Linear, nn.Embedding)): + module.weight.data.normal_(mean=0.0, std=0.02) + if isinstance(module, nn.Linear) and module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.LayerNorm): + module.bias.data.zero_() + module.weight.data.fill_(1.0) + def init_optimizers(self,train_config): + optimizer = torch.optim.Adam(self.parameters(), lr=train_config[varables.RATE_LEARNING]) + return optimizer + def init_scheduler(self,train_config): + scheduler = torch.optim.lr_scheduler.StepLR(self.optimizer, step_size=train_config[varables.SIZE_STEP], gamma=train_config[varables.GAMMA]) + return scheduler + def get_collate_fn(self, vocab): + def collate(results): + x_in = [a[0] for a in results] + y_in = [a[1] for a in results] + boundary = -1 + max_len_x = max([len(a) for a in x_in]) + max_len_y = max([len(a) for a in y_in]) + x = torch.tensor([(a+[vocab[varables.TOKEN_PAD]]*(max_len_x-len(a))) for a in x_in],dtype=torch.long) + y = torch.tensor([(a+[vocab[varables.TOKEN_PAD]]*(max_len_y-len(a))) for a in y_in],dtype=torch.long) + return x,y,boundary + return collate + + def forward(self, x_in, y_in, y_out=None,boundary=None): + x_in = self.drop(self.tok_emb(x_in) + self.pos_emb[:, :x_in.size()[1], :]) + y_in = self.drop(self.tok_emb(y_in) + self.pos_emb[:, :y_in.size()[1], :]) + # + for encoder_block in self.encoder_blocks: + x_in = encoder_block(x_in) + x_in = self.ln_f(x_in) + for decoder_block in self.decoder_blocks: 
+ y_in = decoder_block(x_in,y_in) + y_in = self.ln_f(y_in) + logits = self.head(y_in) + loss = None + if y_out is not None: + loss = F.cross_entropy(logits.view(-1, logits.size(-1)), y_out.view(-1)) + return logits, loss + +# mark test \ No newline at end of file diff --git a/SCMG/models/Transformer_debug2 copy/model.py b/SCMG/models/Transformer_debug2 copy/model.py new file mode 100644 index 0000000000000000000000000000000000000000..d93c209eece94f1cbce12f9907ce3edf18017f6f --- /dev/null +++ b/SCMG/models/Transformer_debug2 copy/model.py @@ -0,0 +1,278 @@ +import math +import logging + +import torch +import torch.nn as nn +from torch.nn import functional as F + +# logger = logging.getLogger(__name__) +from SCMG.config import varables +from torch.autograd import Variable + +# class PositionalEncoder(nn.Module): +# def __init__(self, config): +# super().__init__() +# pe = torch.zeros(config[varables.SIZE_BLOCK], config[varables.DIM_ATTENTION]) +# for pos in range(config[varables.SIZE_BLOCK]): +# for i in range(0, config[varables.DIM_ATTENTION], 2): +# pe[pos, i] = \ +# math.sin(pos / (10000 ** ((2 * i)/config[varables.DIM_ATTENTION]))) +# pe[pos, i + 1] = \ +# math.cos(pos / (10000 ** ((2 * (i + 1))/config[varables.DIM_ATTENTION]))) +# pe = pe.unsqueeze(0) +# self.register_buffer('pe', pe) +# def forward(self, T): +# #add constant to embedding +# x = Variable(self.pe[:,:T], requires_grad=False) +# return x + + + +class PositionalEncoder(nn.Module): + def __init__(self, config): + super(PositionalEncoder, self).__init__() + self.Dropout = nn.Dropout(p=config[varables.RATE_DROPOUT]) + max_len = config[varables.SIZE_BLOCK] + pe = torch.zeros(max_len, config[varables.DIM_ATTENTION]) + position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1) + div_term = torch.exp(torch.arange(0, config[varables.DIM_ATTENTION], 2).float() * (-math.log(10000.0) / config[varables.DIM_ATTENTION])) + pe[:, 0::2] = torch.sin(position * div_term) + pe[:, 1::2] = torch.cos(position * 
div_term) + pe = pe.unsqueeze(0) + self.register_buffer('pe', pe) + def forward(self, T): + x = self.Dropout(self.pe[:,:T, :]) + return x + + + +class Attention(nn.Module): + def __init__(self, config): + super().__init__() + assert config[varables.DIM_ATTENTION] % config[varables.NUM_HEADS] == 0 + self.Key = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION]) + self.Query = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION]) + self.Value = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION]) + self.Dropout_Attention = nn.Dropout(config[varables.RATE_DROPOUT]) + self.Dropout_Residue = nn.Dropout(config[varables.RATE_DROPOUT]) + self.Projection = nn.Linear(config[varables.DIM_ATTENTION], config[varables.DIM_EMBEDDING]) + self.NumberOfHeads = config[varables.NUM_HEADS] + self.DimHead = config[varables.DIM_ATTENTION] // self.NumberOfHeads + self.DimAttention = config[varables.DIM_ATTENTION] + + def forward(self, X_1,X_2, mask=None): + if X_2 is None: + X_2 = X_1 + BatchSize, T_Encoder, _ = X_1.size() + BatchSize, T_Decoder, _ = X_2.size() + K = self.Key( X_1).view(BatchSize, T_Encoder, self.NumberOfHeads,self.DimHead).transpose(1, 2) + Q = self.Query(X_2).view(BatchSize, T_Decoder, self.NumberOfHeads,self.DimHead).transpose(1, 2) + V = self.Value(X_1).view(BatchSize, T_Encoder, self.NumberOfHeads,self.DimHead).transpose(1, 2) + # k,q,v dimension: (BatchSize, SequenceSize, NumberOfHeads, HeadDimension) 3,4,5,16 + ScoreAttention = (Q @ K.transpose(-2, -1)) / math.sqrt(self.DimHead) + ScoreAttention = ScoreAttention.masked_fill(mask==0, -1e9) + ScoreAttention = F.softmax(ScoreAttention, dim=-1) + ScoreAttention = self.Dropout_Attention(ScoreAttention) + # k.transpose(-2,-1): 3,4,16,5 + # (q@(k.transpose(-2,-1))): 3,4,5,5 + Z = ScoreAttention @ V + # y dimension: 3,4,5,16 + Z = Z.transpose(1, 2).contiguous().view(BatchSize, T_Decoder, self.DimAttention) + # y dimension: 3,5,64 + Z = 
self.Dropout_Residue(self.Projection(Z)) + return Z + + + + + + + + + + +class FeedForward(nn.Module): + def __init__(self, config): + super().__init__() + if config[varables.DIM_FEEDFORWARD] == 0: + Dim_FeedForward = config[varables.DIM_ATTENTION] *4 + else: + Dim_FeedForward = config[varables.DIM_FEEDFORWARD] + self.Linear1 = nn.Linear(config[varables.DIM_EMBEDDING], Dim_FeedForward) + self.GELU = nn.GELU() + self.Linear2 = nn.Linear(Dim_FeedForward, config[varables.DIM_EMBEDDING]) + self.Dropout = nn.Dropout(config[varables.RATE_DROPOUT]) + + def forward(self,x): + x = self.Linear1(x) + x = self.GELU (x) + x = self.Dropout(x) + x = self.Linear2(x) + return x + + + + +class EncoderBlock(nn.Module): + def __init__(self, config): + super().__init__() + self.LayerNorm1 = nn.LayerNorm(config[varables.DIM_EMBEDDING]) + self.LayerNorm2 = nn.LayerNorm(config[varables.DIM_EMBEDDING]) + self.Dropout1 = nn.Dropout(config[varables.RATE_DROPOUT]) + self.Dropout2 = nn.Dropout(config[varables.RATE_DROPOUT]) + self.Attention = Attention( config) + self.FeedForward = FeedForward(config) + + def forward(self, X_Encoder,Mask_Encoder): + X_Encoder = self.Dropout1(X_Encoder + self.Attention (self.LayerNorm1(X_Encoder), None, Mask_Encoder)) + X_Encoder = self.Dropout2(X_Encoder + self.FeedForward(self.LayerNorm2(X_Encoder))) + return X_Encoder + +class DecoderBlock(nn.Module): + def __init__(self, config): + super().__init__() + self.LayerNorm1 = nn.LayerNorm(config[varables.DIM_EMBEDDING]) + self.LayerNorm2 = nn.LayerNorm(config[varables.DIM_EMBEDDING]) + self.LayerNorm3 = nn.LayerNorm(config[varables.DIM_EMBEDDING]) + self.Dropout1 = nn.Dropout(config[varables.RATE_DROPOUT]) + self.Dropout2 = nn.Dropout(config[varables.RATE_DROPOUT]) + self.Dropout3 = nn.Dropout(config[varables.RATE_DROPOUT]) + self.AttentionMasked = Attention( config) + self.AttentionCross = Attention( config) + self.FeedForward = FeedForward(config) + + def forward(self, 
X_Encoder,X_Decoder,Mask_Cross,Mask_Decoder): + X_Decoder = self.Dropout1(X_Decoder + self.AttentionMasked(self.LayerNorm1(X_Decoder), None, Mask_Decoder)) + X_Decoder = self.Dropout2(X_Decoder + self.AttentionCross ( X_Encoder, self.LayerNorm2(X_Decoder), Mask_Cross )) + X_Decoder = self.Dropout3(X_Decoder + self.FeedForward (self.LayerNorm3(X_Decoder) )) + return X_Decoder + + + + + + + + + + + + + + + + + + + +class Model(nn.Module): + def __init__(self, config): + super().__init__() + # Varables + self.Dim_Attention = config[varables.DIM_ATTENTION] + self.Token_Padding_Encoder = config["Token_Padding_Encoder"] + self.Token_Padding_Decoder = config["Token_Padding_Decoder"] + # Embedding and positional encoding layers + self.Embedding_Encoder = nn.Embedding(len(config["vocab_encoder"]), config[varables.DIM_ATTENTION]) + self.Embedding_Decoder = nn.Embedding(len(config["vocab_decoder"]), config[varables.DIM_ATTENTION]) + self.pos_emb = PositionalEncoder(config) + # Dropout and normalization layers + self.Dropout1 = nn.Dropout(config[varables.RATE_DROPOUT]) + self.Dropout2 = nn.Dropout(config[varables.RATE_DROPOUT]) + self.LayerNorm1 = nn.LayerNorm(config[varables.DIM_EMBEDDING]) + self.LayerNorm2 = nn.LayerNorm(config[varables.DIM_EMBEDDING]) + # Transformer layers + self.encoder_blocks = nn.ModuleList([EncoderBlock(config) for _ in range(config[varables.NUM_LAYERS])]) + self.decoder_blocks = nn.ModuleList([DecoderBlock(config) for _ in range(config[varables.NUM_LAYERS])]) + # Output layer + self.head = nn.Linear(config[varables.DIM_ATTENTION], len(config["vocab_decoder"]), bias=False) + # Init + self.apply(self._init_weights) + self.optimizer = None + # logger.info("number of parameters: %e", sum(p.numel() for p in self.parameters())) + + def _init_weights(self, module): + for p in module.parameters(): + if p.dim() > 1: + nn.init.xavier_uniform_(p) + # if isinstance(module, (nn.Linear, nn.Embedding)): + # module.weight.data.normal_(mean=0.0, std=0.02) + # if 
isinstance(module, nn.Linear) and module.bias is not None: + # module.bias.data.zero_() + # elif isinstance(module, nn.LayerNorm): + # module.bias.data.zero_() + # module.weight.data.fill_(1.0) + def init_optimizers(self,train_config): + optimizer = torch.optim.Adam(self.parameters(), lr=train_config[varables.RATE_LEARNING]) + return optimizer + def init_scheduler(self,train_config): + scheduler = torch.optim.lr_scheduler.StepLR(self.optimizer, step_size=train_config[varables.SIZE_STEP], gamma=train_config[varables.GAMMA]) + return scheduler + def get_collate_fn(self, vocab_encoder,vocab_decoder): + def collate(results): + X_Encoder = [a[0] for a in results] + X_Decoder = [a[1] for a in results] + boundary = -1 + max_len_x = max([len(a) for a in X_Encoder]) + max_len_y = max([len(a) for a in X_Decoder]) + x = torch.tensor([(a+[vocab_encoder[varables.TOKEN_PAD]]*(max_len_x-len(a))) for a in X_Encoder],dtype=torch.long) + y = torch.tensor([(a+[vocab_decoder[varables.TOKEN_PAD]]*(max_len_y-len(a))) for a in X_Decoder],dtype=torch.long) + return x,y,boundary + return collate + + + def generate_masks(self,X_Encoder, X_Decoder): + # Generate encoder, decoder, cross masks + T = X_Decoder.shape[1] + Mask_Encoder = (X_Encoder != self.Token_Padding_Encoder).unsqueeze(-2).unsqueeze(-2) + Mask_Decoder = (X_Decoder != self.Token_Padding_Decoder).unsqueeze(-2).unsqueeze(-2).repeat(1,1,T,1) + Mask_Cross = (X_Encoder != self.Token_Padding_Encoder).unsqueeze(-2).unsqueeze(-2) + mask_tril = torch.tril(torch.ones(T, T)).view(1, 1, T, T).to(Mask_Decoder.device) + Mask_Decoder = Mask_Decoder.masked_fill(mask_tril==0,0) + return Mask_Encoder,Mask_Decoder,Mask_Cross + + def forward(self, X_Encoder, X_Decoder, Y_Decoder_Ref=None,boundary=None): + Mask_Encoder, Mask_Decoder,Mask_Cross = self.generate_masks(X_Encoder, X_Decoder) + # preprocess + X_Encoder = self.Dropout1(self.Embedding_Encoder(X_Encoder) * math.sqrt(self.Dim_Attention) + self.pos_emb(X_Encoder.size(1))) + X_Decoder = 
self.Dropout2(self.Embedding_Decoder(X_Decoder) * math.sqrt(self.Dim_Attention) + self.pos_emb(X_Decoder.size(1))) + #### Now X_Encoder: BatchSize, SequenceLength, DimAttention + # Encoder blocks + for encoder_block in self.encoder_blocks: + X_Encoder = encoder_block(X_Encoder,Mask_Encoder) + X_Encoder = self.LayerNorm1(X_Encoder) + # Decoder blocks + for decoder_block in self.decoder_blocks: + X_Decoder = decoder_block(X_Encoder,X_Decoder,Mask_Cross,Mask_Decoder) + X_Decoder = self.LayerNorm2(X_Decoder) + Y_Decoder_Logits = self.head(X_Decoder) + loss = None + if Y_Decoder_Ref is not None: + loss = F.cross_entropy(Y_Decoder_Logits.view(-1, Y_Decoder_Logits.size(-1)), Y_Decoder_Ref.view(-1),ignore_index=self.Token_Padding_Decoder) + return Y_Decoder_Logits, loss + + + + + + + + + + + + + # def generate_masks(self,X_Encoder, X_Decoder): + # # Generate encoder, decoder, cross masks + # Mask_Encoder = (X_Encoder != self.Token_Padding_Encoder).unsqueeze(-2).int().cpu() + # Mask_Decoder = (X_Decoder != self.Token_Padding_Decoder).unsqueeze(-2).int().cpu() + # Mask_Cross = Mask_Decoder.unsqueeze(-1) @ Mask_Encoder.unsqueeze(-2) + # Mask_Encoder = Mask_Encoder.unsqueeze(-1) @ Mask_Encoder.unsqueeze(-2) + # Mask_Decoder = Mask_Decoder.unsqueeze(-1) @ Mask_Decoder.unsqueeze(-2) + # T = X_Decoder.shape[1] + # mask_tril = torch.tril(torch.ones(T, T)).view(1, 1, T, T) + # Mask_Decoder = Mask_Decoder.masked_fill(mask_tril==0,0) + # Mask_Encoder = Mask_Encoder.to(X_Encoder.device) + # Mask_Decoder = Mask_Decoder.to(X_Decoder.device) + # Mask_Cross = Mask_Cross.to(X_Encoder.device) + # return Mask_Encoder,Mask_Decoder,Mask_Cross diff --git a/SCMG/models/Transformer_debug2 copy/sampler.py b/SCMG/models/Transformer_debug2 copy/sampler.py new file mode 100644 index 0000000000000000000000000000000000000000..1c606302447534d425b50bbd15c153ea79895b65 --- /dev/null +++ b/SCMG/models/Transformer_debug2 copy/sampler.py @@ -0,0 +1,85 @@ +import random +import numpy as np +import torch 
+import torch.nn as nn +from torch.nn import functional as F + +def set_seed(seed): + random.seed(seed) + np.random.seed(seed) + torch.manual_seed(seed) + torch.cuda.manual_seed_all(seed) + +def top_k_logits(logits, k): + v, ix = torch.topk(logits, k) + out = logits.clone() + out[out < v[:, [-1]]] = -float('Inf') + return out + +@torch.no_grad() +def sample(model, x, steps, temperature=1.0, sample=False, top_k=None): + block_size = model.get_block_size() + model.eval() + for k in range(steps): + x_cond = x if x.size(1) <= block_size else x[:, -block_size:] + logits, _ = model(x_cond) + logits = logits[:, -1, :] / temperature + if top_k is not None: + logits = top_k_logits(logits, top_k) + probs = F.softmax(logits, dim=-1) + if sample: + ix = torch.multinomial(probs, num_samples=1) + else: + _, ix = torch.topk(probs, k=1, dim=-1) + x = torch.cat((x, ix), dim=1) + + return x + + + + +@torch.no_grad() +def sample(model, x, steps, temperature=1.0,boundary=None): + block_size = model.get_block_size() + model.eval() + for k in range(steps): + x_cond = x if x.size(1) <= block_size else x[:, -block_size:] + logits, _ = model(x_cond,boundary=boundary) + logits = logits[:, -1, :] / temperature + probs = F.softmax(logits, dim=-1) + ix = torch.multinomial(probs, num_samples=1) + x = torch.cat((x, ix), dim=1) + return x + +'L_5*C(=O)NCc1cccc(OC)c1.*c1nsc2ccccc12COc1cccc(CNC(=O)c2cccc(NC(=O)c3nsc4ccccc34)c2)c1' + +# for i in range(1,21): +def sample_L(i,option='string'): + # i=2 + prefix = 'L_'+str(i) + string_input = prefix + '*O=C1NN=Cc2c1cccc2.*O=C(C1CC1)N1CCNCC1' + array_input = [vocab[a] for a in [''] + list(string_input)] + boundary = [len(array_input)] + tensor_input = torch.tensor(array_input,device='cuda').unsqueeze(0).repeat(32,1) + boundary = boundary*32 + tensor_output = sample(model,tensor_input,250,boundary=boundary) + strings_output = [] + for j in range(tensor_output.shape[0]): + list_string_output = [inv[a] for a in tensor_output[j,boundary[j]:].cpu().numpy() if 
a != vocab['']] + # if list_string_output[0] == '': + # list_string_output = list_string_output[1:] + if list_string_output[-1] == '': + list_string_output = list_string_output[:-1] + string_output = ''.join(list_string_output) + strings_output.append(string_output) + print(string_output) + for j in range(tensor_output.shape[0]): + if test_valid(strings_output[j]): + print(1) + else: + print(0) + + # logits,_ = model(tensor_input,boundary=boundary) + + +['', 'L', '_', '5', '*', 'C', '(', '=', 'O', ')', 'N', 'C', 'c', '1', 'c', 'c', 'c', 'c', '(', 'O', 'C', ')', 'c', '1', '.', '*', 'c', '1', 'n', 's', 'c', '2', 'c', 'c', 'c', 'c', 'c', '1', '2', 'C', 'O', 'c', '1', 'c', 'c', 'c', 'c', '(', 'C', 'N', 'C', '(', '=', 'O', ')', 'c', '2', 'c', 'c', 'c', 'c', '(', 'N', 'C', '(', '=', 'O', ')', 'c', '3', 'n', 's', 'c', '4', 'c', 'c', 'c', 'c', 'c', '3', '4', ')', 'c', '2', ')', 'c', '1', ''] diff --git a/SCMG/models/Transformer_debug2/__init__.py b/SCMG/models/Transformer_debug2/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/SCMG/models/Transformer_debug2/__pycache__/__init__.cpython-310.pyc b/SCMG/models/Transformer_debug2/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..3f1f423a143412b6be2ebd2b886b4a42ee570950 Binary files /dev/null and b/SCMG/models/Transformer_debug2/__pycache__/__init__.cpython-310.pyc differ diff --git a/SCMG/models/Transformer_debug2/__pycache__/model copy 2.cpython-310.pyc b/SCMG/models/Transformer_debug2/__pycache__/model copy 2.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..193c50af6a84dfa33e102f8745bfe92e0d2307fb Binary files /dev/null and b/SCMG/models/Transformer_debug2/__pycache__/model copy 2.cpython-310.pyc differ diff --git a/SCMG/models/Transformer_debug2/__pycache__/model copy.cpython-310.pyc b/SCMG/models/Transformer_debug2/__pycache__/model 
copy.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..72f2ea02a6efe9f2dbb7ff61c903adcfbc00bb2a Binary files /dev/null and b/SCMG/models/Transformer_debug2/__pycache__/model copy.cpython-310.pyc differ diff --git a/SCMG/models/Transformer_debug2/__pycache__/model.cpython-310.pyc b/SCMG/models/Transformer_debug2/__pycache__/model.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c104e3963e9e4b5c6f346c3c913a91b31996503a Binary files /dev/null and b/SCMG/models/Transformer_debug2/__pycache__/model.cpython-310.pyc differ diff --git a/SCMG/models/Transformer_debug2/__pycache__/sampler.cpython-310.pyc b/SCMG/models/Transformer_debug2/__pycache__/sampler.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..121749b9d57b595bd84555e9d479c65a4e787a61 Binary files /dev/null and b/SCMG/models/Transformer_debug2/__pycache__/sampler.cpython-310.pyc differ diff --git a/SCMG/models/Transformer_debug2/model copy 2.py b/SCMG/models/Transformer_debug2/model copy 2.py new file mode 100644 index 0000000000000000000000000000000000000000..8b254fbe02eafd3f7370cfae17eb6729563aa260 --- /dev/null +++ b/SCMG/models/Transformer_debug2/model copy 2.py @@ -0,0 +1,420 @@ +import math +import logging + +import torch +import torch.nn as nn +from torch.nn import functional as F + +logger = logging.getLogger(__name__) +from SCMG.config import varables + +# class ModelConfig(): +# rate_dropout_embedding = 0.1 +# rate_dropout_residue = 0.1 +# rate_dropout_attention = 0.1 +# block_size=125 +# def __init__(self, size_vocab, **kwargs): +# self.size_vocab = size_vocab +# for k,v in kwargs.items(): +# setattr(self, k, v) + +class CausalSelfAttention(nn.Module): + def __init__(self, config): + super().__init__() + assert config[varables.DIM_ATTENTION] % config[varables.NUM_HEADS] == 0 + self.key = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION]) + self.query = 
nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION]) + self.value = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION]) + self.dropout_attention = nn.Dropout(config[varables.RATE_DROPOUT]) + self.dropout_residue = nn.Dropout(config[varables.RATE_DROPOUT]) + self.projection = nn.Linear(config[varables.DIM_ATTENTION], config[varables.DIM_EMBEDDING]) + self.register_buffer("mask", torch.tril(torch.ones(config[varables.SIZE_BLOCK], config[varables.SIZE_BLOCK])) + .view(1, 1, config[varables.SIZE_BLOCK], config[varables.SIZE_BLOCK])) + self.n_head = config[varables.NUM_HEADS] + self.single_head_dim = config[varables.DIM_ATTENTION] // self.n_head + self.attention_features = config[varables.DIM_ATTENTION] + + def forward(self, x, layer_past=None): + B, T, C = x.size() + k = self.key(x).view(B, T, self.n_head,self.single_head_dim).transpose(1, 2) + q = self.query(x).view(B, T, self.n_head,self.single_head_dim).transpose(1, 2) + v = self.value(x).view(B, T, self.n_head,self.single_head_dim).transpose(1, 2) + att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1))) + att = att.masked_fill(self.mask[:,:,:T,:T] == 0, float('-inf')) + att = F.softmax(att, dim=-1) + att = self.dropout_attention(att) + y = att @ v + y = y.transpose(1, 2).contiguous().view(B, T, self.attention_features) + y = self.dropout_residue(self.projection(y)) + return y + + +class CrossAttention(nn.Module): + def __init__(self, config): + super().__init__() + assert config[varables.DIM_ATTENTION] % config[varables.NUM_HEADS] == 0 + self.key = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION]) + self.query = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION]) + self.value = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION]) + self.dropout_attention = nn.Dropout(config[varables.RATE_DROPOUT]) + self.dropout_residue = nn.Dropout(config[varables.RATE_DROPOUT]) + self.projection = 
nn.Linear(config[varables.DIM_ATTENTION], config[varables.DIM_EMBEDDING]) + self.n_head = config[varables.NUM_HEADS] + self.single_head_dim = config[varables.DIM_ATTENTION] // self.n_head + self.attention_features = config[varables.DIM_ATTENTION] + self.register_buffer("mask", torch.tril(torch.ones(config[varables.SIZE_BLOCK], config[varables.SIZE_BLOCK])) + .view(1, 1, config[varables.SIZE_BLOCK], config[varables.SIZE_BLOCK])) + + def forward(self, x_encoder,x_decoder, layer_past=None): + B_encoder, T_encoder, C_encoder = x_encoder.size() + B_decoder, T_decoder, C_decoder = x_decoder.size() + k = self.key( x_encoder).view(B_encoder, T_encoder, self.n_head,self.single_head_dim).transpose(1, 2) + q = self.query(x_decoder).view(B_encoder, T_decoder, self.n_head,self.single_head_dim).transpose(1, 2) + v = self.value(x_encoder).view(B_encoder, T_encoder, self.n_head,self.single_head_dim).transpose(1, 2) + att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1))) + att = att.masked_fill(self.mask[:,:,:T_decoder,:T_encoder] == 0, float('-inf')) + att = F.softmax(att, dim=-1) + att = self.dropout_attention(att) + y = att @ v + y = y.transpose(1, 2).contiguous().view(B_encoder, T_decoder, self.attention_features) + y = self.dropout_residue(self.projection(y)) + return y + + + + +class EncoderBlock(nn.Module): + def __init__(self, config): + super().__init__() + self.ln1 = nn.LayerNorm(config[varables.DIM_EMBEDDING]) + self.ln2 = nn.LayerNorm(config[varables.DIM_EMBEDDING]) + self.attn = CausalSelfAttention(config) + self.mlp = nn.Sequential( + nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_FEEDFORWARD]), + nn.GELU(), + nn.Linear(config[varables.DIM_FEEDFORWARD], config[varables.DIM_EMBEDDING]), + nn.Dropout(config[varables.RATE_DROPOUT]), + ) + + def forward(self, x): + # = y_input + x = x + self.attn(self.ln1(x)) + x = x + self.mlp(self.ln2(x)) + return x + +class DecoderBlock(nn.Module): + def __init__(self, config): + super().__init__() + self.ln1 = 
nn.LayerNorm(config[varables.DIM_EMBEDDING]) + self.ln2 = nn.LayerNorm(config[varables.DIM_EMBEDDING]) + self.masked_attn = CausalSelfAttention(config) + self.cross_attn = CrossAttention(config) + self.mlp = nn.Sequential( + nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_FEEDFORWARD]), + nn.GELU(), + nn.Linear(config[varables.DIM_FEEDFORWARD], config[varables.DIM_EMBEDDING]), + nn.Dropout(config[varables.RATE_DROPOUT]), + ) + + def forward(self, x_encoder,x): + # = y_input + x = x + self.masked_attn(self.ln1(x)) + x = x + self.cross_attn(x_encoder,self.ln1(x)) + x = x + self.mlp(self.ln2(x)) + return x + + + + + + + + + + + + + + + + + +import torch +import torch.nn as nn +import torch.nn.functional as F +import math + + +class Norm(nn.Module): + def __init__(self, d_model, eps = 1e-6): + super().__init__() + + self.size = d_model + + # create two learnable parameters to calibrate normalisation + self.alpha = nn.Parameter(torch.ones(self.size)) + self.bias = nn.Parameter(torch.zeros(self.size)) + + self.eps = eps + + def forward(self, x): + norm = self.alpha * (x - x.mean(dim=-1, keepdim=True)) \ + / (x.std(dim=-1, keepdim=True) + self.eps) + self.bias + return norm + +def attention(q, k, v, d_k, mask=None, dropout=None): + + scores = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(d_k) + + if mask is not None: + mask = mask.unsqueeze(1) + scores = scores.masked_fill(mask == 0, -1e9) + + scores = F.softmax(scores, dim=-1) + + if dropout is not None: + scores = dropout(scores) + + output = torch.matmul(scores, v) + return output + +class MultiHeadAttention(nn.Module): + def __init__(self, heads, d_model, dropout = 0.1): + super().__init__() + + self.d_model = d_model + self.d_k = d_model // heads + self.h = heads + + self.q_linear = nn.Linear(d_model, d_model) + self.v_linear = nn.Linear(d_model, d_model) + self.k_linear = nn.Linear(d_model, d_model) + + self.dropout = nn.Dropout(dropout) + self.out = nn.Linear(d_model, d_model) + + def forward(self, 
q, k, v, mask=None): + + bs = q.size(0) + + # perform linear operation and split into N heads + k = self.k_linear(k).view(bs, -1, self.h, self.d_k) + q = self.q_linear(q).view(bs, -1, self.h, self.d_k) + v = self.v_linear(v).view(bs, -1, self.h, self.d_k) + + # transpose to get dimensions bs * N * sl * d_model + k = k.transpose(1,2) + q = q.transpose(1,2) + v = v.transpose(1,2) + + + # calculate attention using function we will define next + scores = attention(q, k, v, self.d_k, mask, self.dropout) + # concatenate heads and put through final linear layer + concat = scores.transpose(1,2).contiguous()\ + .view(bs, -1, self.d_model) + output = self.out(concat) + + return output + +class FeedForward(nn.Module): + def __init__(self, d_model, d_ff=2048, dropout = 0.1): + super().__init__() + + # We set d_ff as a default to 2048 + self.linear_1 = nn.Linear(d_model, d_ff) + self.dropout = nn.Dropout(dropout) + self.linear_2 = nn.Linear(d_ff, d_model) + + def forward(self, x): + x = self.dropout(F.relu(self.linear_1(x))) + x = self.linear_2(x) + return x + + + + +import torch +import torch.nn as nn +import copy + + +class EncoderLayer(nn.Module): + def __init__(self, d_model, heads, dropout=0.1): + super().__init__() + self.norm_1 = Norm(d_model) + self.norm_2 = Norm(d_model) + self.attn = MultiHeadAttention(heads, d_model, dropout=dropout) + self.ff = FeedForward(d_model, dropout=dropout) + self.dropout_1 = nn.Dropout(dropout) + self.dropout_2 = nn.Dropout(dropout) + + def forward(self, x, mask): + x2 = self.norm_1(x) + x = x + self.dropout_1(self.attn(x2,x2,x2,mask)) + x2 = self.norm_2(x) + x = x + self.dropout_2(self.ff(x2)) + return x + +# build a decoder layer with two multi-head attention layers and +# one feed-forward layer +class DecoderLayer(nn.Module): + def __init__(self, d_model, heads, dropout=0.1): + super().__init__() + self.norm_1 = Norm(d_model) + self.norm_2 = Norm(d_model) + self.norm_3 = Norm(d_model) + + self.dropout_1 = nn.Dropout(dropout) + 
self.dropout_2 = nn.Dropout(dropout) + self.dropout_3 = nn.Dropout(dropout) + + self.attn_1 = MultiHeadAttention(heads, d_model, dropout=dropout) + self.attn_2 = MultiHeadAttention(heads, d_model, dropout=dropout) + self.ff = FeedForward(d_model, dropout=dropout) + + def forward(self, x, e_outputs, src_mask, trg_mask): + x2 = self.norm_1(x) + x = x + self.dropout_1(self.attn_1(x2, x2, x2, trg_mask)) + x2 = self.norm_2(x) + x = x + self.dropout_2(self.attn_2(x2, e_outputs, e_outputs, \ + src_mask)) + x2 = self.norm_3(x) + x = x + self.dropout_3(self.ff(x2)) + return x + + +import torch +import torch.nn as nn +import math +from torch.autograd import Variable + +class Embedder(nn.Module): + def __init__(self, vocab_size, d_model): + super().__init__() + self.d_model = d_model + self.embed = nn.Embedding(vocab_size, d_model) + def forward(self, x): + return self.embed(x) + +class PositionalEncoder(nn.Module): + def __init__(self, d_model, max_seq_len = 200, dropout = 0.1): + super().__init__() + self.d_model = d_model + self.dropout = nn.Dropout(dropout) + # create constant 'pe' matrix with values dependant on + # pos and i + pe = torch.zeros(max_seq_len, d_model) + for pos in range(max_seq_len): + for i in range(0, d_model, 2): + pe[pos, i] = \ + math.sin(pos / (10000 ** ((2 * i)/d_model))) + pe[pos, i + 1] = \ + math.cos(pos / (10000 ** ((2 * (i + 1))/d_model))) + pe = pe.unsqueeze(0) + self.register_buffer('pe', pe) + + + def forward(self, x): + # make embeddings relatively larger + x = x * math.sqrt(self.d_model) + #add constant to embedding + seq_len = x.size(1) + pe = Variable(self.pe[:,:seq_len], requires_grad=False) + if x.is_cuda: + pe.cuda() + x = x + pe + return self.dropout(x) + +def get_clones(module, N): + return nn.ModuleList([copy.deepcopy(module) for i in range(N)]) + +class Encoder(nn.Module): + def __init__(self, vocab_size, d_model, N, heads, dropout): + super().__init__() + self.N = N + self.embed = Embedder(vocab_size, d_model) + self.pe = 
PositionalEncoder(d_model, dropout=dropout) + self.layers = get_clones(EncoderLayer(d_model, heads, dropout), N) + self.norm = Norm(d_model) + def forward(self, src, mask): + x = self.embed(src) + x = self.pe(x) + for i in range(self.N): + x = self.layers[i](x, mask) + return self.norm(x) + +class Decoder(nn.Module): + def __init__(self, vocab_size, d_model, N, heads, dropout): + super().__init__() + self.N = N + self.embed = Embedder(vocab_size, d_model) + self.pe = PositionalEncoder(d_model, dropout=dropout) + self.layers = get_clones(DecoderLayer(d_model, heads, dropout), N) + self.norm = Norm(d_model) + def forward(self, trg, e_outputs, src_mask, trg_mask): + x = self.embed(trg) + x = self.pe(x) + for i in range(self.N): + x = self.layers[i](x, e_outputs, src_mask, trg_mask) + return self.norm(x) + +class Model(nn.Module): + def __init__(self, config): + super().__init__() + self.encoder = Encoder(len(config["vocab_encoder"]), config[varables.DIM_ATTENTION], config[varables.NUM_LAYERS], config[varables.NUM_HEADS], config[varables.RATE_DROPOUT]) + self.decoder = Decoder(len(config["vocab_decoder"]), config[varables.DIM_ATTENTION], config[varables.NUM_LAYERS], config[varables.NUM_HEADS], config[varables.RATE_DROPOUT]) + self.out = nn.Linear(config[varables.DIM_ATTENTION], len(config["vocab_decoder"])) + # self.tok_emb = nn.Embedding(config[varables.SIZE_VOCAB], config[varables.DIM_EMBEDDING]) + # self.pos_emb = nn.Parameter(torch.zeros(1, config[varables.SIZE_BLOCK], config[varables.DIM_EMBEDDING])) + # self.drop = nn.Dropout(config[varables.RATE_DROPOUT]) + # self.encoder_blocks = nn.ModuleList([EncoderBlock(config) for _ in range(config[varables.NUM_LAYERS])]) + # self.decoder_blocks = nn.ModuleList([DecoderBlock(config) for _ in range(config[varables.NUM_LAYERS])]) + # self.blocks = nn.Sequential(*[DecoderBlock(config) for _ in range(config[varables.NUM_LAYERS])]) + # self.ln_f = nn.LayerNorm(config[varables.DIM_EMBEDDING]) + # self.head = 
nn.Linear(config[varables.DIM_EMBEDDING], config[varables.SIZE_VOCAB], bias=False) + # self.block_size = config[varables.SIZE_BLOCK] + # self.apply(self._init_weights) + # logger.info("number of parameters: %e", sum(p.numel() for p in self.parameters())) + self.optimizer = None + + def get_block_size(self): + return self.block_size + + def _init_weights(self, module): + if isinstance(module, (nn.Linear, nn.Embedding)): + module.weight.data.normal_(mean=0.0, std=0.02) + if isinstance(module, nn.Linear) and module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.LayerNorm): + module.bias.data.zero_() + module.weight.data.fill_(1.0) + def init_optimizers(self,train_config): + optimizer = torch.optim.Adam(self.parameters(), lr=train_config[varables.RATE_LEARNING]) + return optimizer + def init_scheduler(self,train_config): + scheduler = torch.optim.lr_scheduler.StepLR(self.optimizer, step_size=train_config[varables.SIZE_STEP], gamma=train_config[varables.GAMMA]) + return scheduler + def get_collate_fn(self, vocab_encoder,vocab_decoder): + def collate(results): + x_in = [a[0] for a in results] + y_in = [a[1] for a in results] + boundary = -1 + max_len_x = max([len(a) for a in x_in]) + max_len_y = max([len(a) for a in y_in]) + x = torch.tensor([(a+[vocab_encoder[varables.TOKEN_PAD]]*(max_len_x-len(a))) for a in x_in],dtype=torch.long) + y = torch.tensor([(a+[vocab_decoder[varables.TOKEN_PAD]]*(max_len_y-len(a))) for a in y_in],dtype=torch.long) + return x,y,boundary + return collate + def forward(self, src, trg, trg_out, boundary=None): + src_mask = None + trg_mask = torch.tril(torch.ones(trg.shape[1], trg.shape[1])).view(1, 1, trg.shape[1], trg.shape[1]).to(trg.device) + e_outputs = self.encoder(src, src_mask) + d_output = self.decoder(trg, e_outputs, src_mask, trg_mask) + logits = self.out(d_output) + loss = None + if trg_out is not None: + loss = F.cross_entropy(logits.view(-1, logits.size(-1)), trg_out.view(-1)) + return logits, loss + +# 
mark test \ No newline at end of file diff --git a/SCMG/models/Transformer_debug2/model copy.py b/SCMG/models/Transformer_debug2/model copy.py new file mode 100644 index 0000000000000000000000000000000000000000..85ed98da342e63696371099158471e07cd1bf25c --- /dev/null +++ b/SCMG/models/Transformer_debug2/model copy.py @@ -0,0 +1,187 @@ +import math +import logging + +import torch +import torch.nn as nn +from torch.nn import functional as F + +logger = logging.getLogger(__name__) +from SCMG.config import varables + +# class ModelConfig(): +# rate_dropout_embedding = 0.1 +# rate_dropout_residue = 0.1 +# rate_dropout_attention = 0.1 +# block_size=125 +# def __init__(self, size_vocab, **kwargs): +# self.size_vocab = size_vocab +# for k,v in kwargs.items(): +# setattr(self, k, v) + +class CausalSelfAttention(nn.Module): + def __init__(self, config): + super().__init__() + assert config[varables.DIM_ATTENTION] % config[varables.NUM_HEADS] == 0 + self.key = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION]) + self.query = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION]) + self.value = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION]) + self.dropout_attention = nn.Dropout(config[varables.RATE_DROPOUT]) + self.dropout_residue = nn.Dropout(config[varables.RATE_DROPOUT]) + self.projection = nn.Linear(config[varables.DIM_ATTENTION], config[varables.DIM_EMBEDDING]) + self.register_buffer("mask", torch.tril(torch.ones(config[varables.SIZE_BLOCK], config[varables.SIZE_BLOCK])) + .view(1, 1, config[varables.SIZE_BLOCK], config[varables.SIZE_BLOCK])) + self.n_head = config[varables.NUM_HEADS] + self.single_head_dim = config[varables.DIM_ATTENTION] // self.n_head + self.attention_features = config[varables.DIM_ATTENTION] + + def forward(self, x, layer_past=None): + B, T, C = x.size() + k = self.key(x).view(B, T, self.n_head,self.single_head_dim).transpose(1, 2) + q = self.query(x).view(B, T, 
self.n_head,self.single_head_dim).transpose(1, 2) + v = self.value(x).view(B, T, self.n_head,self.single_head_dim).transpose(1, 2) + att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1))) + att = att.masked_fill(self.mask[:,:,:T,:T] == 0, float('-inf')) + att = F.softmax(att, dim=-1) + att = self.dropout_attention(att) + y = att @ v + y = y.transpose(1, 2).contiguous().view(B, T, self.attention_features) + y = self.dropout_residue(self.projection(y)) + return y + + +class CrossAttention(nn.Module): + def __init__(self, config): + super().__init__() + assert config[varables.DIM_ATTENTION] % config[varables.NUM_HEADS] == 0 + self.key = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION]) + self.query = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION]) + self.value = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION]) + self.dropout_attention = nn.Dropout(config[varables.RATE_DROPOUT]) + self.dropout_residue = nn.Dropout(config[varables.RATE_DROPOUT]) + self.projection = nn.Linear(config[varables.DIM_ATTENTION], config[varables.DIM_EMBEDDING]) + self.n_head = config[varables.NUM_HEADS] + self.single_head_dim = config[varables.DIM_ATTENTION] // self.n_head + self.attention_features = config[varables.DIM_ATTENTION] + self.register_buffer("mask", torch.tril(torch.ones(config[varables.SIZE_BLOCK], config[varables.SIZE_BLOCK])) + .view(1, 1, config[varables.SIZE_BLOCK], config[varables.SIZE_BLOCK])) + + def forward(self, x_encoder,x_decoder, layer_past=None): + B_encoder, T_encoder, C_encoder = x_encoder.size() + B_decoder, T_decoder, C_decoder = x_decoder.size() + k = self.key( x_encoder).view(B_encoder, T_encoder, self.n_head,self.single_head_dim).transpose(1, 2) + q = self.query(x_decoder).view(B_encoder, T_decoder, self.n_head,self.single_head_dim).transpose(1, 2) + v = self.value(x_encoder).view(B_encoder, T_encoder, self.n_head,self.single_head_dim).transpose(1, 2) + att = (q @ 
k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1))) + att = att.masked_fill(self.mask[:,:,:T_decoder,:T_encoder] == 0, float('-inf')) + att = F.softmax(att, dim=-1) + att = self.dropout_attention(att) + y = att @ v + y = y.transpose(1, 2).contiguous().view(B_encoder, T_decoder, self.attention_features) + y = self.dropout_residue(self.projection(y)) + return y + + + + +class EncoderBlock(nn.Module): + def __init__(self, config): + super().__init__() + self.ln1 = nn.LayerNorm(config[varables.DIM_EMBEDDING]) + self.ln2 = nn.LayerNorm(config[varables.DIM_EMBEDDING]) + self.attn = CausalSelfAttention(config) + self.mlp = nn.Sequential( + nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_FEEDFORWARD]), + nn.GELU(), + nn.Linear(config[varables.DIM_FEEDFORWARD], config[varables.DIM_EMBEDDING]), + nn.Dropout(config[varables.RATE_DROPOUT]), + ) + + def forward(self, x): + # = y_input + x = x + self.attn(self.ln1(x)) + x = x + self.mlp(self.ln2(x)) + return x + +class DecoderBlock(nn.Module): + def __init__(self, config): + super().__init__() + self.ln1 = nn.LayerNorm(config[varables.DIM_EMBEDDING]) + self.ln2 = nn.LayerNorm(config[varables.DIM_EMBEDDING]) + self.masked_attn = CausalSelfAttention(config) + self.cross_attn = CrossAttention(config) + self.mlp = nn.Sequential( + nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_FEEDFORWARD]), + nn.GELU(), + nn.Linear(config[varables.DIM_FEEDFORWARD], config[varables.DIM_EMBEDDING]), + nn.Dropout(config[varables.RATE_DROPOUT]), + ) + + def forward(self, x_encoder,x): + # = y_input + x = x + self.masked_attn(self.ln1(x)) + x = x + self.cross_attn(x_encoder,self.ln1(x)) + x = x + self.mlp(self.ln2(x)) + return x + +class Model(nn.Module): + def __init__(self, config): + super().__init__() + self.tok_emb = nn.Embedding(config[varables.SIZE_VOCAB], config[varables.DIM_EMBEDDING]) + self.pos_emb = nn.Parameter(torch.zeros(1, config[varables.SIZE_BLOCK], config[varables.DIM_EMBEDDING])) + self.drop = 
nn.Dropout(config[varables.RATE_DROPOUT]) + self.encoder_blocks = nn.ModuleList([EncoderBlock(config) for _ in range(config[varables.NUM_LAYERS])]) + self.decoder_blocks = nn.ModuleList([DecoderBlock(config) for _ in range(config[varables.NUM_LAYERS])]) + # self.blocks = nn.Sequential(*[DecoderBlock(config) for _ in range(config[varables.NUM_LAYERS])]) + self.ln_f = nn.LayerNorm(config[varables.DIM_EMBEDDING]) + self.head = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.SIZE_VOCAB], bias=False) + self.block_size = config[varables.SIZE_BLOCK] + self.apply(self._init_weights) + logger.info("number of parameters: %e", sum(p.numel() for p in self.parameters())) + self.optimizer = None + + def get_block_size(self): + return self.block_size + + def _init_weights(self, module): + if isinstance(module, (nn.Linear, nn.Embedding)): + module.weight.data.normal_(mean=0.0, std=0.02) + if isinstance(module, nn.Linear) and module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.LayerNorm): + module.bias.data.zero_() + module.weight.data.fill_(1.0) + def init_optimizers(self,train_config): + optimizer = torch.optim.Adam(self.parameters(), lr=train_config[varables.RATE_LEARNING]) + return optimizer + def init_scheduler(self,train_config): + scheduler = torch.optim.lr_scheduler.StepLR(self.optimizer, step_size=train_config[varables.SIZE_STEP], gamma=train_config[varables.GAMMA]) + return scheduler + def get_collate_fn(self, vocab): + def collate(results): + x_in = [a[0] for a in results] + y_in = [a[1] for a in results] + boundary = -1 + max_len_x = max([len(a) for a in x_in]) + max_len_y = max([len(a) for a in y_in]) + x = torch.tensor([(a+[vocab[varables.TOKEN_PAD]]*(max_len_x-len(a))) for a in x_in],dtype=torch.long) + y = torch.tensor([(a+[vocab[varables.TOKEN_PAD]]*(max_len_y-len(a))) for a in y_in],dtype=torch.long) + return x,y,boundary + return collate + + def forward(self, x_in, y_in, y_out=None,boundary=None): + x_in = 
self.drop(self.tok_emb(x_in) + self.pos_emb[:, :x_in.size()[1], :]) + y_in = self.drop(self.tok_emb(y_in) + self.pos_emb[:, :y_in.size()[1], :]) + # + for encoder_block in self.encoder_blocks: + x_in = encoder_block(x_in) + x_in = self.ln_f(x_in) + for decoder_block in self.decoder_blocks: + y_in = decoder_block(x_in,y_in) + y_in = self.ln_f(y_in) + logits = self.head(y_in) + loss = None + if y_out is not None: + loss = F.cross_entropy(logits.view(-1, logits.size(-1)), y_out.view(-1)) + return logits, loss + +# mark test \ No newline at end of file diff --git a/SCMG/models/Transformer_debug2/model.py b/SCMG/models/Transformer_debug2/model.py new file mode 100644 index 0000000000000000000000000000000000000000..a3190f0c3ef2b4e5f0f10aea05f644b7514b73ad --- /dev/null +++ b/SCMG/models/Transformer_debug2/model.py @@ -0,0 +1,246 @@ +import math +import logging + +import torch +import torch.nn as nn +from torch.nn import functional as F + +# logger = logging.getLogger(__name__) +from SCMG.config import varables +from torch.autograd import Variable + +class PositionalEncoder(nn.Module): + def __init__(self, config): + super(PositionalEncoder, self).__init__() + self.Dropout = nn.Dropout(p=config[varables.RATE_DROPOUT]) + max_len = config[varables.SIZE_BLOCK] + pe = torch.zeros(max_len, config[varables.DIM_ATTENTION]) + position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1) + div_term = torch.exp(torch.arange(0, config[varables.DIM_ATTENTION], 2).float() * (-math.log(10000.0) / config[varables.DIM_ATTENTION])) + pe[:, 0::2] = torch.sin(position * div_term) + pe[:, 1::2] = torch.cos(position * div_term) + pe = pe.unsqueeze(0) + self.register_buffer('pe', pe) + def forward(self, T): + x = self.Dropout(self.pe[:,:T, :]) + return x + + +class Attention(nn.Module): + def __init__(self, config): + super().__init__() + assert config[varables.DIM_ATTENTION] % config[varables.NUM_HEADS] == 0 + self.Key = nn.Linear(config[varables.DIM_EMBEDDING], 
config[varables.DIM_ATTENTION]) + self.Query = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION]) + self.Value = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION]) + self.Dropout_Attention = nn.Dropout(config[varables.RATE_DROPOUT]) + self.Dropout_Residue = nn.Dropout(config[varables.RATE_DROPOUT]) + self.Projection = nn.Linear(config[varables.DIM_ATTENTION], config[varables.DIM_EMBEDDING]) + self.NumberOfHeads = config[varables.NUM_HEADS] + self.DimHead = config[varables.DIM_ATTENTION] // self.NumberOfHeads + self.DimAttention = config[varables.DIM_ATTENTION] + + def forward(self, X_1,X_2, mask=None): + if X_2 is None: + X_2 = X_1 + BatchSize, T_Encoder, _ = X_1.size() + BatchSize, T_Decoder, _ = X_2.size() + K = self.Key( X_1).view(BatchSize, T_Encoder, self.NumberOfHeads,self.DimHead).transpose(1, 2) + Q = self.Query(X_2).view(BatchSize, T_Decoder, self.NumberOfHeads,self.DimHead).transpose(1, 2) + V = self.Value(X_1).view(BatchSize, T_Encoder, self.NumberOfHeads,self.DimHead).transpose(1, 2) + # k,q,v dimension: (BatchSize, SequenceSize, NumberOfHeads, HeadDimension) 3,4,5,16 + ScoreAttention = (Q @ K.transpose(-2, -1)) / math.sqrt(self.DimHead) + ScoreAttention = ScoreAttention.masked_fill(mask==0, -1e9) + ScoreAttention = F.softmax(ScoreAttention, dim=-1) + ScoreAttention = self.Dropout_Attention(ScoreAttention) + # k.transpose(-2,-1): 3,4,16,5 + # (q@(k.transpose(-2,-1))): 3,4,5,5 + Z = ScoreAttention @ V + # y dimension: 3,4,5,16 + Z = Z.transpose(1, 2).contiguous().view(BatchSize, T_Decoder, self.DimAttention) + # y dimension: 3,5,64 + Z = self.Dropout_Residue(self.Projection(Z)) + return Z + + + + + + + + + + +class FeedForward(nn.Module): + def __init__(self, config): + super().__init__() + if config[varables.DIM_FEEDFORWARD] == 0: + Dim_FeedForward = config[varables.DIM_ATTENTION] *4 + else: + Dim_FeedForward = config[varables.DIM_FEEDFORWARD] + self.Linear1 = nn.Linear(config[varables.DIM_EMBEDDING], 
Dim_FeedForward) + self.GELU = nn.GELU() + self.Linear2 = nn.Linear(Dim_FeedForward, config[varables.DIM_EMBEDDING]) + self.Dropout = nn.Dropout(config[varables.RATE_DROPOUT]) + + def forward(self,x): + x = self.Linear1(x) + x = self.GELU (x) + x = self.Dropout(x) + x = self.Linear2(x) + return x + + + + +class EncoderBlock(nn.Module): + def __init__(self, config): + super().__init__() + self.LayerNorm1 = nn.LayerNorm(config[varables.DIM_EMBEDDING]) + self.LayerNorm2 = nn.LayerNorm(config[varables.DIM_EMBEDDING]) + self.Dropout1 = nn.Dropout(config[varables.RATE_DROPOUT]) + self.Dropout2 = nn.Dropout(config[varables.RATE_DROPOUT]) + self.Attention = Attention( config) + self.FeedForward = FeedForward(config) + + def forward(self, X_Encoder,Mask_Encoder): + X_Encoder = self.Dropout1(X_Encoder + self.Attention (self.LayerNorm1(X_Encoder), None, Mask_Encoder)) + X_Encoder = self.Dropout2(X_Encoder + self.FeedForward(self.LayerNorm2(X_Encoder))) + return X_Encoder + +class DecoderBlock(nn.Module): + def __init__(self, config): + super().__init__() + self.LayerNorm1 = nn.LayerNorm(config[varables.DIM_EMBEDDING]) + self.LayerNorm2 = nn.LayerNorm(config[varables.DIM_EMBEDDING]) + self.LayerNorm3 = nn.LayerNorm(config[varables.DIM_EMBEDDING]) + self.Dropout1 = nn.Dropout(config[varables.RATE_DROPOUT]) + self.Dropout2 = nn.Dropout(config[varables.RATE_DROPOUT]) + self.Dropout3 = nn.Dropout(config[varables.RATE_DROPOUT]) + self.AttentionMasked = Attention( config) + self.AttentionCross = Attention( config) + self.FeedForward = FeedForward(config) + + def forward(self, X_Encoder,X_Decoder,Mask_Cross,Mask_Decoder): + X_Decoder = self.Dropout1(X_Decoder + self.AttentionMasked(self.LayerNorm1(X_Decoder), None, Mask_Decoder)) + X_Decoder = self.Dropout2(X_Decoder + self.AttentionCross ( X_Encoder, self.LayerNorm2(X_Decoder), Mask_Cross )) + X_Decoder = self.Dropout3(X_Decoder + self.FeedForward (self.LayerNorm3(X_Decoder) )) + return X_Decoder + + + + + + + + + + + + + + + + + + 
+ +class Model(nn.Module): + def __init__(self, config): + super().__init__() + # Varables + self.Dim_Attention = config[varables.DIM_ATTENTION] + self.Token_Padding_Encoder = config["Token_Padding_Encoder"] + self.Token_Padding_Decoder = config["Token_Padding_Decoder"] + # Embedding and positional encoding layers + self.Embedding_Encoder = nn.Embedding(len(config["vocab_encoder"]), config[varables.DIM_ATTENTION]) + self.Embedding_Decoder = nn.Embedding(len(config["vocab_decoder"]), config[varables.DIM_ATTENTION]) + self.pos_emb = PositionalEncoder(config) + # Dropout and normalization layers + self.Dropout1 = nn.Dropout(config[varables.RATE_DROPOUT]) + self.Dropout2 = nn.Dropout(config[varables.RATE_DROPOUT]) + self.LayerNorm1 = nn.LayerNorm(config[varables.DIM_EMBEDDING]) + self.LayerNorm2 = nn.LayerNorm(config[varables.DIM_EMBEDDING]) + # Transformer layers + self.encoder_blocks = nn.ModuleList([EncoderBlock(config) for _ in range(config[varables.NUM_LAYERS])]) + self.decoder_blocks = nn.ModuleList([DecoderBlock(config) for _ in range(config[varables.NUM_LAYERS])]) + # Output layer + self.head = nn.Linear(config[varables.DIM_ATTENTION], len(config["vocab_decoder"]), bias=False) + # Init + self.apply(self._init_weights) + self.optimizer = None + # logger.info("number of parameters: %e", sum(p.numel() for p in self.parameters())) + + def _init_weights(self, module): + for p in module.parameters(): + if p.dim() > 1: + nn.init.xavier_uniform_(p) + # if isinstance(module, (nn.Linear, nn.Embedding)): + # module.weight.data.normal_(mean=0.0, std=0.02) + # if isinstance(module, nn.Linear) and module.bias is not None: + # module.bias.data.zero_() + # elif isinstance(module, nn.LayerNorm): + # module.bias.data.zero_() + # module.weight.data.fill_(1.0) + def init_optimizers(self,train_config): + optimizer = torch.optim.Adam(self.parameters(), lr=train_config[varables.RATE_LEARNING]) + return optimizer + def init_scheduler(self,train_config): + scheduler = 
torch.optim.lr_scheduler.StepLR(self.optimizer, step_size=train_config[varables.SIZE_STEP], gamma=train_config[varables.GAMMA]) + return scheduler + def get_collate_fn(self, vocab_encoder,vocab_decoder): + def collate(results): + X_Encoder = [a[0] for a in results] + X_Decoder = [a[1] for a in results] + boundary = -1 + max_len_x = max([len(a) for a in X_Encoder]) + max_len_y = max([len(a) for a in X_Decoder]) + x = torch.tensor([(a+[vocab_encoder[varables.TOKEN_PAD]]*(max_len_x-len(a))) for a in X_Encoder],dtype=torch.long) + y = torch.tensor([(a+[vocab_decoder[varables.TOKEN_PAD]]*(max_len_y-len(a))) for a in X_Decoder],dtype=torch.long) + return x,y,boundary + return collate + + def generate_masks(self,X_Encoder, X_Decoder): + # Generate encoder, decoder, cross masks + T = X_Decoder.shape[1] + Mask_Encoder = (X_Encoder != self.Token_Padding_Encoder).unsqueeze(-2).unsqueeze(-2) + Mask_Decoder = (X_Decoder != self.Token_Padding_Decoder).unsqueeze(-2).unsqueeze(-2).repeat(1,1,T,1) + Mask_Cross = (X_Encoder != self.Token_Padding_Encoder).unsqueeze(-2).unsqueeze(-2) + mask_tril = torch.tril(torch.ones(T, T)).view(1, 1, T, T).to(Mask_Decoder.device) + Mask_Decoder = Mask_Decoder.masked_fill(mask_tril==0,0) + return Mask_Encoder,Mask_Decoder,Mask_Cross + + def forward(self, X_Encoder, X_Decoder, Y_Decoder_Ref=None,boundary=None): + Mask_Encoder, Mask_Decoder,Mask_Cross = self.generate_masks(X_Encoder, X_Decoder) + # preprocess + X_Encoder = self.Dropout1(self.Embedding_Encoder(X_Encoder) * math.sqrt(self.Dim_Attention) + self.pos_emb(X_Encoder.size(1))) + X_Decoder = self.Dropout2(self.Embedding_Decoder(X_Decoder) * math.sqrt(self.Dim_Attention) + self.pos_emb(X_Decoder.size(1))) + #### Now X_Encoder: BatchSize, SequenceLength, DimAttention + # Encoder blocks + for encoder_block in self.encoder_blocks: + X_Encoder = encoder_block(X_Encoder,Mask_Encoder) + X_Encoder = self.LayerNorm1(X_Encoder) + # Decoder blocks + for decoder_block in self.decoder_blocks: + X_Decoder = 
decoder_block(X_Encoder,X_Decoder,Mask_Cross,Mask_Decoder) + X_Decoder = self.LayerNorm2(X_Decoder) + Y_Decoder_Logits = self.head(X_Decoder) + loss = None + if Y_Decoder_Ref is not None: + loss = F.cross_entropy(Y_Decoder_Logits.view(-1, Y_Decoder_Logits.size(-1)), Y_Decoder_Ref.view(-1),ignore_index=self.Token_Padding_Decoder) + return Y_Decoder_Logits, loss + + # def generate_masks(self,X_Encoder, X_Decoder): + # # Generate encoder, decoder, cross masks + # Mask_Encoder = (X_Encoder != self.Token_Padding_Encoder).unsqueeze(-2).int().cpu() + # Mask_Decoder = (X_Decoder != self.Token_Padding_Decoder).unsqueeze(-2).int().cpu() + # Mask_Cross = Mask_Decoder.unsqueeze(-1) @ Mask_Encoder.unsqueeze(-2) + # Mask_Encoder = Mask_Encoder.unsqueeze(-1) @ Mask_Encoder.unsqueeze(-2) + # Mask_Decoder = Mask_Decoder.unsqueeze(-1) @ Mask_Decoder.unsqueeze(-2) + # T = X_Decoder.shape[1] + # mask_tril = torch.tril(torch.ones(T, T)).view(1, 1, T, T) + # Mask_Decoder = Mask_Decoder.masked_fill(mask_tril==0,0) + # Mask_Encoder = Mask_Encoder.to(X_Encoder.device) + # Mask_Decoder = Mask_Decoder.to(X_Decoder.device) + # Mask_Cross = Mask_Cross.to(X_Encoder.device) + # return Mask_Encoder,Mask_Decoder,Mask_Cross diff --git a/SCMG/models/Transformer_debug2/sampler.py b/SCMG/models/Transformer_debug2/sampler.py new file mode 100644 index 0000000000000000000000000000000000000000..1c606302447534d425b50bbd15c153ea79895b65 --- /dev/null +++ b/SCMG/models/Transformer_debug2/sampler.py @@ -0,0 +1,85 @@ +import random +import numpy as np +import torch +import torch.nn as nn +from torch.nn import functional as F + +def set_seed(seed): + random.seed(seed) + np.random.seed(seed) + torch.manual_seed(seed) + torch.cuda.manual_seed_all(seed) + +def top_k_logits(logits, k): + v, ix = torch.topk(logits, k) + out = logits.clone() + out[out < v[:, [-1]]] = -float('Inf') + return out + +@torch.no_grad() +def sample(model, x, steps, temperature=1.0, sample=False, top_k=None): + block_size = 
model.get_block_size() + model.eval() + for k in range(steps): + x_cond = x if x.size(1) <= block_size else x[:, -block_size:] + logits, _ = model(x_cond) + logits = logits[:, -1, :] / temperature + if top_k is not None: + logits = top_k_logits(logits, top_k) + probs = F.softmax(logits, dim=-1) + if sample: + ix = torch.multinomial(probs, num_samples=1) + else: + _, ix = torch.topk(probs, k=1, dim=-1) + x = torch.cat((x, ix), dim=1) + + return x + + + + +@torch.no_grad() +def sample(model, x, steps, temperature=1.0,boundary=None): + block_size = model.get_block_size() + model.eval() + for k in range(steps): + x_cond = x if x.size(1) <= block_size else x[:, -block_size:] + logits, _ = model(x_cond,boundary=boundary) + logits = logits[:, -1, :] / temperature + probs = F.softmax(logits, dim=-1) + ix = torch.multinomial(probs, num_samples=1) + x = torch.cat((x, ix), dim=1) + return x + +'L_5*C(=O)NCc1cccc(OC)c1.*c1nsc2ccccc12COc1cccc(CNC(=O)c2cccc(NC(=O)c3nsc4ccccc34)c2)c1' + +# for i in range(1,21): +def sample_L(i,option='string'): + # i=2 + prefix = 'L_'+str(i) + string_input = prefix + '*O=C1NN=Cc2c1cccc2.*O=C(C1CC1)N1CCNCC1' + array_input = [vocab[a] for a in [''] + list(string_input)] + boundary = [len(array_input)] + tensor_input = torch.tensor(array_input,device='cuda').unsqueeze(0).repeat(32,1) + boundary = boundary*32 + tensor_output = sample(model,tensor_input,250,boundary=boundary) + strings_output = [] + for j in range(tensor_output.shape[0]): + list_string_output = [inv[a] for a in tensor_output[j,boundary[j]:].cpu().numpy() if a != vocab['']] + # if list_string_output[0] == '': + # list_string_output = list_string_output[1:] + if list_string_output[-1] == '': + list_string_output = list_string_output[:-1] + string_output = ''.join(list_string_output) + strings_output.append(string_output) + print(string_output) + for j in range(tensor_output.shape[0]): + if test_valid(strings_output[j]): + print(1) + else: + print(0) + + # logits,_ = 
model(tensor_input,boundary=boundary) + + +['', 'L', '_', '5', '*', 'C', '(', '=', 'O', ')', 'N', 'C', 'c', '1', 'c', 'c', 'c', 'c', '(', 'O', 'C', ')', 'c', '1', '.', '*', 'c', '1', 'n', 's', 'c', '2', 'c', 'c', 'c', 'c', 'c', '1', '2', 'C', 'O', 'c', '1', 'c', 'c', 'c', 'c', '(', 'C', 'N', 'C', '(', '=', 'O', ')', 'c', '2', 'c', 'c', 'c', 'c', '(', 'N', 'C', '(', '=', 'O', ')', 'c', '3', 'n', 's', 'c', '4', 'c', 'c', 'c', 'c', 'c', '3', '4', ')', 'c', '2', ')', 'c', '1', ''] diff --git a/SCMG/models/Transformer_debug3/__init__.py b/SCMG/models/Transformer_debug3/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/SCMG/models/Transformer_debug3/__pycache__/__init__.cpython-310.pyc b/SCMG/models/Transformer_debug3/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..45e210b6d15ad235e89217d27cfaa8c06fb1fc8c Binary files /dev/null and b/SCMG/models/Transformer_debug3/__pycache__/__init__.cpython-310.pyc differ diff --git a/SCMG/models/Transformer_debug3/__pycache__/model copy 2.cpython-310.pyc b/SCMG/models/Transformer_debug3/__pycache__/model copy 2.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9539e1e0a1cd72ba4f1393948122ccbb8788a395 Binary files /dev/null and b/SCMG/models/Transformer_debug3/__pycache__/model copy 2.cpython-310.pyc differ diff --git a/SCMG/models/Transformer_debug3/__pycache__/model copy.cpython-310.pyc b/SCMG/models/Transformer_debug3/__pycache__/model copy.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..39991c790e37f0536bf046f6521ecb60bdf104c0 Binary files /dev/null and b/SCMG/models/Transformer_debug3/__pycache__/model copy.cpython-310.pyc differ diff --git a/SCMG/models/Transformer_debug3/__pycache__/model.cpython-310.pyc b/SCMG/models/Transformer_debug3/__pycache__/model.cpython-310.pyc new file mode 100644 index 
0000000000000000000000000000000000000000..7700db3b42c92211764fb4960ab77403a185fdf3 Binary files /dev/null and b/SCMG/models/Transformer_debug3/__pycache__/model.cpython-310.pyc differ diff --git a/SCMG/models/Transformer_debug3/__pycache__/sampler.cpython-310.pyc b/SCMG/models/Transformer_debug3/__pycache__/sampler.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c4a7514350571c1f5cd6ca4f86c5e4c52b4ec339 Binary files /dev/null and b/SCMG/models/Transformer_debug3/__pycache__/sampler.cpython-310.pyc differ diff --git a/SCMG/models/Transformer_debug3/model copy 2.py b/SCMG/models/Transformer_debug3/model copy 2.py new file mode 100644 index 0000000000000000000000000000000000000000..8b254fbe02eafd3f7370cfae17eb6729563aa260 --- /dev/null +++ b/SCMG/models/Transformer_debug3/model copy 2.py @@ -0,0 +1,420 @@ +import math +import logging + +import torch +import torch.nn as nn +from torch.nn import functional as F + +logger = logging.getLogger(__name__) +from SCMG.config import varables + +# class ModelConfig(): +# rate_dropout_embedding = 0.1 +# rate_dropout_residue = 0.1 +# rate_dropout_attention = 0.1 +# block_size=125 +# def __init__(self, size_vocab, **kwargs): +# self.size_vocab = size_vocab +# for k,v in kwargs.items(): +# setattr(self, k, v) + +class CausalSelfAttention(nn.Module): + def __init__(self, config): + super().__init__() + assert config[varables.DIM_ATTENTION] % config[varables.NUM_HEADS] == 0 + self.key = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION]) + self.query = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION]) + self.value = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION]) + self.dropout_attention = nn.Dropout(config[varables.RATE_DROPOUT]) + self.dropout_residue = nn.Dropout(config[varables.RATE_DROPOUT]) + self.projection = nn.Linear(config[varables.DIM_ATTENTION], config[varables.DIM_EMBEDDING]) + self.register_buffer("mask", 
torch.tril(torch.ones(config[varables.SIZE_BLOCK], config[varables.SIZE_BLOCK])) + .view(1, 1, config[varables.SIZE_BLOCK], config[varables.SIZE_BLOCK])) + self.n_head = config[varables.NUM_HEADS] + self.single_head_dim = config[varables.DIM_ATTENTION] // self.n_head + self.attention_features = config[varables.DIM_ATTENTION] + + def forward(self, x, layer_past=None): + B, T, C = x.size() + k = self.key(x).view(B, T, self.n_head,self.single_head_dim).transpose(1, 2) + q = self.query(x).view(B, T, self.n_head,self.single_head_dim).transpose(1, 2) + v = self.value(x).view(B, T, self.n_head,self.single_head_dim).transpose(1, 2) + att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1))) + att = att.masked_fill(self.mask[:,:,:T,:T] == 0, float('-inf')) + att = F.softmax(att, dim=-1) + att = self.dropout_attention(att) + y = att @ v + y = y.transpose(1, 2).contiguous().view(B, T, self.attention_features) + y = self.dropout_residue(self.projection(y)) + return y + + +class CrossAttention(nn.Module): + def __init__(self, config): + super().__init__() + assert config[varables.DIM_ATTENTION] % config[varables.NUM_HEADS] == 0 + self.key = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION]) + self.query = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION]) + self.value = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION]) + self.dropout_attention = nn.Dropout(config[varables.RATE_DROPOUT]) + self.dropout_residue = nn.Dropout(config[varables.RATE_DROPOUT]) + self.projection = nn.Linear(config[varables.DIM_ATTENTION], config[varables.DIM_EMBEDDING]) + self.n_head = config[varables.NUM_HEADS] + self.single_head_dim = config[varables.DIM_ATTENTION] // self.n_head + self.attention_features = config[varables.DIM_ATTENTION] + self.register_buffer("mask", torch.tril(torch.ones(config[varables.SIZE_BLOCK], config[varables.SIZE_BLOCK])) + .view(1, 1, config[varables.SIZE_BLOCK], config[varables.SIZE_BLOCK])) + + def 
forward(self, x_encoder,x_decoder, layer_past=None): + B_encoder, T_encoder, C_encoder = x_encoder.size() + B_decoder, T_decoder, C_decoder = x_decoder.size() + k = self.key( x_encoder).view(B_encoder, T_encoder, self.n_head,self.single_head_dim).transpose(1, 2) + q = self.query(x_decoder).view(B_encoder, T_decoder, self.n_head,self.single_head_dim).transpose(1, 2) + v = self.value(x_encoder).view(B_encoder, T_encoder, self.n_head,self.single_head_dim).transpose(1, 2) + att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1))) + att = att.masked_fill(self.mask[:,:,:T_decoder,:T_encoder] == 0, float('-inf')) + att = F.softmax(att, dim=-1) + att = self.dropout_attention(att) + y = att @ v + y = y.transpose(1, 2).contiguous().view(B_encoder, T_decoder, self.attention_features) + y = self.dropout_residue(self.projection(y)) + return y + + + + +class EncoderBlock(nn.Module): + def __init__(self, config): + super().__init__() + self.ln1 = nn.LayerNorm(config[varables.DIM_EMBEDDING]) + self.ln2 = nn.LayerNorm(config[varables.DIM_EMBEDDING]) + self.attn = CausalSelfAttention(config) + self.mlp = nn.Sequential( + nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_FEEDFORWARD]), + nn.GELU(), + nn.Linear(config[varables.DIM_FEEDFORWARD], config[varables.DIM_EMBEDDING]), + nn.Dropout(config[varables.RATE_DROPOUT]), + ) + + def forward(self, x): + # = y_input + x = x + self.attn(self.ln1(x)) + x = x + self.mlp(self.ln2(x)) + return x + +class DecoderBlock(nn.Module): + def __init__(self, config): + super().__init__() + self.ln1 = nn.LayerNorm(config[varables.DIM_EMBEDDING]) + self.ln2 = nn.LayerNorm(config[varables.DIM_EMBEDDING]) + self.masked_attn = CausalSelfAttention(config) + self.cross_attn = CrossAttention(config) + self.mlp = nn.Sequential( + nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_FEEDFORWARD]), + nn.GELU(), + nn.Linear(config[varables.DIM_FEEDFORWARD], config[varables.DIM_EMBEDDING]), + nn.Dropout(config[varables.RATE_DROPOUT]), 
+ ) + + def forward(self, x_encoder,x): + # = y_input + x = x + self.masked_attn(self.ln1(x)) + x = x + self.cross_attn(x_encoder,self.ln1(x)) + x = x + self.mlp(self.ln2(x)) + return x + + + + + + + + + + + + + + + + + +import torch +import torch.nn as nn +import torch.nn.functional as F +import math + + +class Norm(nn.Module): + def __init__(self, d_model, eps = 1e-6): + super().__init__() + + self.size = d_model + + # create two learnable parameters to calibrate normalisation + self.alpha = nn.Parameter(torch.ones(self.size)) + self.bias = nn.Parameter(torch.zeros(self.size)) + + self.eps = eps + + def forward(self, x): + norm = self.alpha * (x - x.mean(dim=-1, keepdim=True)) \ + / (x.std(dim=-1, keepdim=True) + self.eps) + self.bias + return norm + +def attention(q, k, v, d_k, mask=None, dropout=None): + + scores = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(d_k) + + if mask is not None: + mask = mask.unsqueeze(1) + scores = scores.masked_fill(mask == 0, -1e9) + + scores = F.softmax(scores, dim=-1) + + if dropout is not None: + scores = dropout(scores) + + output = torch.matmul(scores, v) + return output + +class MultiHeadAttention(nn.Module): + def __init__(self, heads, d_model, dropout = 0.1): + super().__init__() + + self.d_model = d_model + self.d_k = d_model // heads + self.h = heads + + self.q_linear = nn.Linear(d_model, d_model) + self.v_linear = nn.Linear(d_model, d_model) + self.k_linear = nn.Linear(d_model, d_model) + + self.dropout = nn.Dropout(dropout) + self.out = nn.Linear(d_model, d_model) + + def forward(self, q, k, v, mask=None): + + bs = q.size(0) + + # perform linear operation and split into N heads + k = self.k_linear(k).view(bs, -1, self.h, self.d_k) + q = self.q_linear(q).view(bs, -1, self.h, self.d_k) + v = self.v_linear(v).view(bs, -1, self.h, self.d_k) + + # transpose to get dimensions bs * N * sl * d_model + k = k.transpose(1,2) + q = q.transpose(1,2) + v = v.transpose(1,2) + + + # calculate attention using function we will define 
next + scores = attention(q, k, v, self.d_k, mask, self.dropout) + # concatenate heads and put through final linear layer + concat = scores.transpose(1,2).contiguous()\ + .view(bs, -1, self.d_model) + output = self.out(concat) + + return output + +class FeedForward(nn.Module): + def __init__(self, d_model, d_ff=2048, dropout = 0.1): + super().__init__() + + # We set d_ff as a default to 2048 + self.linear_1 = nn.Linear(d_model, d_ff) + self.dropout = nn.Dropout(dropout) + self.linear_2 = nn.Linear(d_ff, d_model) + + def forward(self, x): + x = self.dropout(F.relu(self.linear_1(x))) + x = self.linear_2(x) + return x + + + + +import torch +import torch.nn as nn +import copy + + +class EncoderLayer(nn.Module): + def __init__(self, d_model, heads, dropout=0.1): + super().__init__() + self.norm_1 = Norm(d_model) + self.norm_2 = Norm(d_model) + self.attn = MultiHeadAttention(heads, d_model, dropout=dropout) + self.ff = FeedForward(d_model, dropout=dropout) + self.dropout_1 = nn.Dropout(dropout) + self.dropout_2 = nn.Dropout(dropout) + + def forward(self, x, mask): + x2 = self.norm_1(x) + x = x + self.dropout_1(self.attn(x2,x2,x2,mask)) + x2 = self.norm_2(x) + x = x + self.dropout_2(self.ff(x2)) + return x + +# build a decoder layer with two multi-head attention layers and +# one feed-forward layer +class DecoderLayer(nn.Module): + def __init__(self, d_model, heads, dropout=0.1): + super().__init__() + self.norm_1 = Norm(d_model) + self.norm_2 = Norm(d_model) + self.norm_3 = Norm(d_model) + + self.dropout_1 = nn.Dropout(dropout) + self.dropout_2 = nn.Dropout(dropout) + self.dropout_3 = nn.Dropout(dropout) + + self.attn_1 = MultiHeadAttention(heads, d_model, dropout=dropout) + self.attn_2 = MultiHeadAttention(heads, d_model, dropout=dropout) + self.ff = FeedForward(d_model, dropout=dropout) + + def forward(self, x, e_outputs, src_mask, trg_mask): + x2 = self.norm_1(x) + x = x + self.dropout_1(self.attn_1(x2, x2, x2, trg_mask)) + x2 = self.norm_2(x) + x = x + 
self.dropout_2(self.attn_2(x2, e_outputs, e_outputs, \ + src_mask)) + x2 = self.norm_3(x) + x = x + self.dropout_3(self.ff(x2)) + return x + + +import torch +import torch.nn as nn +import math +from torch.autograd import Variable + +class Embedder(nn.Module): + def __init__(self, vocab_size, d_model): + super().__init__() + self.d_model = d_model + self.embed = nn.Embedding(vocab_size, d_model) + def forward(self, x): + return self.embed(x) + +class PositionalEncoder(nn.Module): + def __init__(self, d_model, max_seq_len = 200, dropout = 0.1): + super().__init__() + self.d_model = d_model + self.dropout = nn.Dropout(dropout) + # create constant 'pe' matrix with values dependant on + # pos and i + pe = torch.zeros(max_seq_len, d_model) + for pos in range(max_seq_len): + for i in range(0, d_model, 2): + pe[pos, i] = \ + math.sin(pos / (10000 ** ((2 * i)/d_model))) + pe[pos, i + 1] = \ + math.cos(pos / (10000 ** ((2 * (i + 1))/d_model))) + pe = pe.unsqueeze(0) + self.register_buffer('pe', pe) + + + def forward(self, x): + # make embeddings relatively larger + x = x * math.sqrt(self.d_model) + #add constant to embedding + seq_len = x.size(1) + pe = Variable(self.pe[:,:seq_len], requires_grad=False) + if x.is_cuda: + pe.cuda() + x = x + pe + return self.dropout(x) + +def get_clones(module, N): + return nn.ModuleList([copy.deepcopy(module) for i in range(N)]) + +class Encoder(nn.Module): + def __init__(self, vocab_size, d_model, N, heads, dropout): + super().__init__() + self.N = N + self.embed = Embedder(vocab_size, d_model) + self.pe = PositionalEncoder(d_model, dropout=dropout) + self.layers = get_clones(EncoderLayer(d_model, heads, dropout), N) + self.norm = Norm(d_model) + def forward(self, src, mask): + x = self.embed(src) + x = self.pe(x) + for i in range(self.N): + x = self.layers[i](x, mask) + return self.norm(x) + +class Decoder(nn.Module): + def __init__(self, vocab_size, d_model, N, heads, dropout): + super().__init__() + self.N = N + self.embed = 
Embedder(vocab_size, d_model) + self.pe = PositionalEncoder(d_model, dropout=dropout) + self.layers = get_clones(DecoderLayer(d_model, heads, dropout), N) + self.norm = Norm(d_model) + def forward(self, trg, e_outputs, src_mask, trg_mask): + x = self.embed(trg) + x = self.pe(x) + for i in range(self.N): + x = self.layers[i](x, e_outputs, src_mask, trg_mask) + return self.norm(x) + +class Model(nn.Module): + def __init__(self, config): + super().__init__() + self.encoder = Encoder(len(config["vocab_encoder"]), config[varables.DIM_ATTENTION], config[varables.NUM_LAYERS], config[varables.NUM_HEADS], config[varables.RATE_DROPOUT]) + self.decoder = Decoder(len(config["vocab_decoder"]), config[varables.DIM_ATTENTION], config[varables.NUM_LAYERS], config[varables.NUM_HEADS], config[varables.RATE_DROPOUT]) + self.out = nn.Linear(config[varables.DIM_ATTENTION], len(config["vocab_decoder"])) + # self.tok_emb = nn.Embedding(config[varables.SIZE_VOCAB], config[varables.DIM_EMBEDDING]) + # self.pos_emb = nn.Parameter(torch.zeros(1, config[varables.SIZE_BLOCK], config[varables.DIM_EMBEDDING])) + # self.drop = nn.Dropout(config[varables.RATE_DROPOUT]) + # self.encoder_blocks = nn.ModuleList([EncoderBlock(config) for _ in range(config[varables.NUM_LAYERS])]) + # self.decoder_blocks = nn.ModuleList([DecoderBlock(config) for _ in range(config[varables.NUM_LAYERS])]) + # self.blocks = nn.Sequential(*[DecoderBlock(config) for _ in range(config[varables.NUM_LAYERS])]) + # self.ln_f = nn.LayerNorm(config[varables.DIM_EMBEDDING]) + # self.head = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.SIZE_VOCAB], bias=False) + # self.block_size = config[varables.SIZE_BLOCK] + # self.apply(self._init_weights) + # logger.info("number of parameters: %e", sum(p.numel() for p in self.parameters())) + self.optimizer = None + + def get_block_size(self): + return self.block_size + + def _init_weights(self, module): + if isinstance(module, (nn.Linear, nn.Embedding)): + 
module.weight.data.normal_(mean=0.0, std=0.02) + if isinstance(module, nn.Linear) and module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.LayerNorm): + module.bias.data.zero_() + module.weight.data.fill_(1.0) + def init_optimizers(self,train_config): + optimizer = torch.optim.Adam(self.parameters(), lr=train_config[varables.RATE_LEARNING]) + return optimizer + def init_scheduler(self,train_config): + scheduler = torch.optim.lr_scheduler.StepLR(self.optimizer, step_size=train_config[varables.SIZE_STEP], gamma=train_config[varables.GAMMA]) + return scheduler + def get_collate_fn(self, vocab_encoder,vocab_decoder): + def collate(results): + x_in = [a[0] for a in results] + y_in = [a[1] for a in results] + boundary = -1 + max_len_x = max([len(a) for a in x_in]) + max_len_y = max([len(a) for a in y_in]) + x = torch.tensor([(a+[vocab_encoder[varables.TOKEN_PAD]]*(max_len_x-len(a))) for a in x_in],dtype=torch.long) + y = torch.tensor([(a+[vocab_decoder[varables.TOKEN_PAD]]*(max_len_y-len(a))) for a in y_in],dtype=torch.long) + return x,y,boundary + return collate + def forward(self, src, trg, trg_out, boundary=None): + src_mask = None + trg_mask = torch.tril(torch.ones(trg.shape[1], trg.shape[1])).view(1, 1, trg.shape[1], trg.shape[1]).to(trg.device) + e_outputs = self.encoder(src, src_mask) + d_output = self.decoder(trg, e_outputs, src_mask, trg_mask) + logits = self.out(d_output) + loss = None + if trg_out is not None: + loss = F.cross_entropy(logits.view(-1, logits.size(-1)), trg_out.view(-1)) + return logits, loss + +# mark test \ No newline at end of file diff --git a/SCMG/models/Transformer_debug3/model copy.py b/SCMG/models/Transformer_debug3/model copy.py new file mode 100644 index 0000000000000000000000000000000000000000..85ed98da342e63696371099158471e07cd1bf25c --- /dev/null +++ b/SCMG/models/Transformer_debug3/model copy.py @@ -0,0 +1,187 @@ +import math +import logging + +import torch +import torch.nn as nn +from torch.nn import 
class CausalSelfAttention(nn.Module):
    """Multi-head self-attention with a fixed lower-triangular (causal) mask."""

    def __init__(self, config):
        super().__init__()
        assert config[varables.DIM_ATTENTION] % config[varables.NUM_HEADS] == 0
        d_in = config[varables.DIM_EMBEDDING]
        d_att = config[varables.DIM_ATTENTION]
        block = config[varables.SIZE_BLOCK]
        self.key = nn.Linear(d_in, d_att)
        self.query = nn.Linear(d_in, d_att)
        self.value = nn.Linear(d_in, d_att)
        self.dropout_attention = nn.Dropout(config[varables.RATE_DROPOUT])
        self.dropout_residue = nn.Dropout(config[varables.RATE_DROPOUT])
        self.projection = nn.Linear(d_att, d_in)
        # (1, 1, block, block) causal mask shared by every forward pass
        self.register_buffer(
            "mask", torch.tril(torch.ones(block, block)).view(1, 1, block, block)
        )
        self.n_head = config[varables.NUM_HEADS]
        self.single_head_dim = d_att // self.n_head
        self.attention_features = d_att

    def forward(self, x, layer_past=None):
        batch, steps, _ = x.size()

        def split_heads(proj):
            # (B, T, d_att) -> (B, heads, T, head_dim)
            return proj(x).view(batch, steps, self.n_head, self.single_head_dim).transpose(1, 2)

        k = split_heads(self.key)
        q = split_heads(self.query)
        v = split_heads(self.value)
        scores = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1)))
        scores = scores.masked_fill(self.mask[:, :, :steps, :steps] == 0, float("-inf"))
        weights = self.dropout_attention(F.softmax(scores, dim=-1))
        mixed = (weights @ v).transpose(1, 2).contiguous().view(batch, steps, self.attention_features)
        return self.dropout_residue(self.projection(mixed))


class CrossAttention(nn.Module):
    """Multi-head cross-attention: decoder queries attend to encoder keys/values."""

    def __init__(self, config):
        super().__init__()
        assert config[varables.DIM_ATTENTION] % config[varables.NUM_HEADS] == 0
        d_in = config[varables.DIM_EMBEDDING]
        d_att = config[varables.DIM_ATTENTION]
        block = config[varables.SIZE_BLOCK]
        self.key = nn.Linear(d_in, d_att)
        self.query = nn.Linear(d_in, d_att)
        self.value = nn.Linear(d_in, d_att)
        self.dropout_attention = nn.Dropout(config[varables.RATE_DROPOUT])
        self.dropout_residue = nn.Dropout(config[varables.RATE_DROPOUT])
        self.projection = nn.Linear(d_att, d_in)
        self.n_head = config[varables.NUM_HEADS]
        self.single_head_dim = d_att // self.n_head
        self.attention_features = d_att
        self.register_buffer(
            "mask", torch.tril(torch.ones(block, block)).view(1, 1, block, block)
        )

    def forward(self, x_encoder, x_decoder, layer_past=None):
        batch, t_enc, _ = x_encoder.size()
        _, t_dec, _ = x_decoder.size()
        k = self.key(x_encoder).view(batch, t_enc, self.n_head, self.single_head_dim).transpose(1, 2)
        q = self.query(x_decoder).view(batch, t_dec, self.n_head, self.single_head_dim).transpose(1, 2)
        v = self.value(x_encoder).view(batch, t_enc, self.n_head, self.single_head_dim).transpose(1, 2)
        scores = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1)))
        # NOTE(review): a causal (tril) mask is applied to *cross*-attention,
        # so decoder position t cannot see encoder positions > t.  Standard
        # cross-attention attends to all encoder positions; the newer model.py
        # in this package uses a padding-based Mask_Cross instead — confirm
        # this restriction is intentional here.
        scores = scores.masked_fill(self.mask[:, :, :t_dec, :t_enc] == 0, float("-inf"))
        weights = self.dropout_attention(F.softmax(scores, dim=-1))
        mixed = (weights @ v).transpose(1, 2).contiguous().view(batch, t_dec, self.attention_features)
        return self.dropout_residue(self.projection(mixed))
class EncoderBlock(nn.Module):
    """Pre-norm encoder block: self-attention then GELU MLP, with residual adds."""

    def __init__(self, config):
        super().__init__()
        d_emb = config[varables.DIM_EMBEDDING]
        d_ff = config[varables.DIM_FEEDFORWARD]
        self.ln1 = nn.LayerNorm(d_emb)
        self.ln2 = nn.LayerNorm(d_emb)
        self.attn = CausalSelfAttention(config)
        self.mlp = nn.Sequential(
            nn.Linear(d_emb, d_ff),
            nn.GELU(),
            nn.Linear(d_ff, d_emb),
            nn.Dropout(config[varables.RATE_DROPOUT]),
        )

    def forward(self, x):
        # NOTE(review): CausalSelfAttention masks future positions even though
        # this is the encoder side — confirm that is intended.
        x = x + self.attn(self.ln1(x))
        return x + self.mlp(self.ln2(x))


class DecoderBlock(nn.Module):
    """Pre-norm decoder block: masked self-attention, cross-attention, MLP."""

    def __init__(self, config):
        super().__init__()
        d_emb = config[varables.DIM_EMBEDDING]
        d_ff = config[varables.DIM_FEEDFORWARD]
        self.ln1 = nn.LayerNorm(d_emb)
        self.ln2 = nn.LayerNorm(d_emb)
        self.masked_attn = CausalSelfAttention(config)
        self.cross_attn = CrossAttention(config)
        self.mlp = nn.Sequential(
            nn.Linear(d_emb, d_ff),
            nn.GELU(),
            nn.Linear(d_ff, d_emb),
            nn.Dropout(config[varables.RATE_DROPOUT]),
        )

    def forward(self, x_encoder, x):
        x = x + self.masked_attn(self.ln1(x))
        # NOTE(review): ln1 is reused here; a standard pre-norm decoder would
        # give the cross-attention sublayer its own LayerNorm.
        x = x + self.cross_attn(x_encoder, self.ln1(x))
        return x + self.mlp(self.ln2(x))


class Model(nn.Module):
    """Encoder-decoder transformer sharing one token-embedding table and one
    learned positional embedding for both the encoder and decoder inputs."""

    def __init__(self, config):
        super().__init__()
        d_emb = config[varables.DIM_EMBEDDING]
        n_layers = config[varables.NUM_LAYERS]
        self.tok_emb = nn.Embedding(config[varables.SIZE_VOCAB], d_emb)
        # learned absolute positions, shared by encoder and decoder inputs
        self.pos_emb = nn.Parameter(torch.zeros(1, config[varables.SIZE_BLOCK], d_emb))
        self.drop = nn.Dropout(config[varables.RATE_DROPOUT])
        self.encoder_blocks = nn.ModuleList(EncoderBlock(config) for _ in range(n_layers))
        self.decoder_blocks = nn.ModuleList(DecoderBlock(config) for _ in range(n_layers))
        self.ln_f = nn.LayerNorm(d_emb)
        self.head = nn.Linear(d_emb, config[varables.SIZE_VOCAB], bias=False)
        self.block_size = config[varables.SIZE_BLOCK]
        self.apply(self._init_weights)
        logger.info("number of parameters: %e", sum(p.numel() for p in self.parameters()))
        self.optimizer = None  # assigned externally after init_optimizers()

    def get_block_size(self):
        return self.block_size

    def _init_weights(self, module):
        """GPT-style init: N(0, 0.02) weights, zero biases, unit LayerNorm."""
        if isinstance(module, (nn.Linear, nn.Embedding)):
            module.weight.data.normal_(mean=0.0, std=0.02)
            if isinstance(module, nn.Linear) and module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

    def init_optimizers(self, train_config):
        """Create an Adam optimizer over all parameters (not stored on self)."""
        return torch.optim.Adam(self.parameters(), lr=train_config[varables.RATE_LEARNING])

    def init_scheduler(self, train_config):
        """Create a StepLR scheduler bound to the externally-set self.optimizer."""
        return torch.optim.lr_scheduler.StepLR(
            self.optimizer,
            step_size=train_config[varables.SIZE_STEP],
            gamma=train_config[varables.GAMMA],
        )

    def get_collate_fn(self, vocab):
        """Collate fn padding both token lists with the shared vocab's pad id."""
        def collate(samples):
            xs = [s[0] for s in samples]
            ys = [s[1] for s in samples]
            widest_x = max(len(s) for s in xs)
            widest_y = max(len(s) for s in ys)
            pad = vocab[varables.TOKEN_PAD]
            x = torch.tensor([s + [pad] * (widest_x - len(s)) for s in xs], dtype=torch.long)
            y = torch.tensor([s + [pad] * (widest_y - len(s)) for s in ys], dtype=torch.long)
            return x, y, -1  # boundary = -1: unused by this model
        return collate

    def forward(self, x_in, y_in, y_out=None, boundary=None):
        """Embed, run encoder then decoder stacks, project to vocabulary logits.

        Returns (logits, loss); loss is None unless ``y_out`` targets are given.
        """
        x_in = self.drop(self.tok_emb(x_in) + self.pos_emb[:, : x_in.size()[1], :])
        y_in = self.drop(self.tok_emb(y_in) + self.pos_emb[:, : y_in.size()[1], :])
        for block in self.encoder_blocks:
            x_in = block(x_in)
        x_in = self.ln_f(x_in)
        for block in self.decoder_blocks:
            y_in = block(x_in, y_in)
        # NOTE(review): ln_f is shared by encoder and decoder outputs
        y_in = self.ln_f(y_in)
        logits = self.head(y_in)
        loss = None
        if y_out is not None:
            loss = F.cross_entropy(logits.view(-1, logits.size(-1)), y_out.view(-1))
        return logits, loss
class PositionalEncoder(nn.Module):
    """Fixed sinusoidal positional-encoding table with dropout.

    ``forward(T)`` returns the encodings for the first T positions, shaped
    (1, T, DIM_ATTENTION); the caller adds them to the embeddings.
    """

    def __init__(self, config):
        super(PositionalEncoder, self).__init__()
        self.Dropout = nn.Dropout(p=config[varables.RATE_DROPOUT])
        max_len = config[varables.SIZE_BLOCK]
        dim = config[varables.DIM_ATTENTION]
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(
            torch.arange(0, dim, 2).float() * (-math.log(10000.0) / dim)
        )
        table = torch.zeros(max_len, dim)
        table[:, 0::2] = torch.sin(position * div_term)
        table[:, 1::2] = torch.cos(position * div_term)
        self.register_buffer("pe", table.unsqueeze(0))

    def forward(self, T):
        # NOTE(review): dropout is applied to the encodings alone, before they
        # are added to the embeddings in Model.forward — confirm intended.
        return self.Dropout(self.pe[:, :T, :])


class Attention(nn.Module):
    """Multi-head attention used for both self- and cross-attention.

    ``X_1`` supplies keys/values, ``X_2`` supplies queries; passing ``X_2=None``
    means self-attention on ``X_1``.  Positions where ``mask`` is 0 receive
    -1e9 before softmax; despite the default, a mask must be provided
    (``mask=None`` would fail in ``masked_fill``).
    """

    def __init__(self, config):
        super().__init__()
        assert config[varables.DIM_ATTENTION] % config[varables.NUM_HEADS] == 0
        d_in = config[varables.DIM_EMBEDDING]
        d_att = config[varables.DIM_ATTENTION]
        self.Key = nn.Linear(d_in, d_att)
        self.Query = nn.Linear(d_in, d_att)
        self.Value = nn.Linear(d_in, d_att)
        self.Dropout_Attention = nn.Dropout(config[varables.RATE_DROPOUT])
        self.Dropout_Residue = nn.Dropout(config[varables.RATE_DROPOUT])
        self.Projection = nn.Linear(d_att, d_in)
        self.NumberOfHeads = config[varables.NUM_HEADS]
        self.DimHead = d_att // self.NumberOfHeads
        self.DimAttention = d_att

    def forward(self, X_1, X_2, mask=None):
        if X_2 is None:
            X_2 = X_1
        batch, t_kv, _ = X_1.size()
        _, t_q, _ = X_2.size()
        # (B, T, d_att) -> (B, heads, T, head_dim)
        K = self.Key(X_1).view(batch, t_kv, self.NumberOfHeads, self.DimHead).transpose(1, 2)
        Q = self.Query(X_2).view(batch, t_q, self.NumberOfHeads, self.DimHead).transpose(1, 2)
        V = self.Value(X_1).view(batch, t_kv, self.NumberOfHeads, self.DimHead).transpose(1, 2)
        scores = (Q @ K.transpose(-2, -1)) / math.sqrt(self.DimHead)
        scores = scores.masked_fill(mask == 0, -1e9)
        weights = self.Dropout_Attention(F.softmax(scores, dim=-1))
        Z = (weights @ V).transpose(1, 2).contiguous().view(batch, t_q, self.DimAttention)
        return self.Dropout_Residue(self.Projection(Z))


class FeedForward(nn.Module):
    """Position-wise MLP; DIM_FEEDFORWARD == 0 means use 4 * DIM_ATTENTION."""

    def __init__(self, config):
        super().__init__()
        d_ff = config[varables.DIM_FEEDFORWARD]
        hidden = config[varables.DIM_ATTENTION] * 4 if d_ff == 0 else d_ff
        self.Linear1 = nn.Linear(config[varables.DIM_EMBEDDING], hidden)
        self.GELU = nn.GELU()
        self.Linear2 = nn.Linear(hidden, config[varables.DIM_EMBEDDING])
        self.Dropout = nn.Dropout(config[varables.RATE_DROPOUT])

    def forward(self, x):
        # dropout sits between the activation and the output projection
        return self.Linear2(self.Dropout(self.GELU(self.Linear1(x))))
class EncoderBlock(nn.Module):
    """Post-norm encoder block: self-attention + feed-forward with residuals.

    Dropout is applied to the sublayer *inputs* (kept exactly as written).
    """

    def __init__(self, config):
        super().__init__()
        d_emb = config[varables.DIM_EMBEDDING]
        p = config[varables.RATE_DROPOUT]
        self.LayerNorm1 = nn.LayerNorm(d_emb)
        self.LayerNorm2 = nn.LayerNorm(d_emb)
        self.Dropout1 = nn.Dropout(p)
        self.Dropout2 = nn.Dropout(p)
        self.Attention = Attention(config)
        self.FeedForward = FeedForward(config)

    def forward(self, X_Encoder, Mask_Encoder):
        attended = self.Attention(self.Dropout1(X_Encoder), None, Mask_Encoder)
        X_Encoder = self.LayerNorm1(X_Encoder + attended)
        X_Encoder = self.LayerNorm2(X_Encoder + self.FeedForward(self.Dropout2(X_Encoder)))
        return X_Encoder


class DecoderBlock(nn.Module):
    """Post-norm decoder block: masked self-attention, cross-attention, feed-forward."""

    def __init__(self, config):
        super().__init__()
        d_emb = config[varables.DIM_EMBEDDING]
        p = config[varables.RATE_DROPOUT]
        self.LayerNorm1 = nn.LayerNorm(d_emb)
        self.LayerNorm2 = nn.LayerNorm(d_emb)
        self.LayerNorm3 = nn.LayerNorm(d_emb)
        self.Dropout1 = nn.Dropout(p)
        self.Dropout2 = nn.Dropout(p)
        self.Dropout3 = nn.Dropout(p)
        self.AttentionMasked = Attention(config)
        self.AttentionCross = Attention(config)
        self.FeedForward = FeedForward(config)

    def forward(self, X_Encoder, X_Decoder, Mask_Cross, Mask_Decoder):
        self_att = self.AttentionMasked(self.Dropout1(X_Decoder), None, Mask_Decoder)
        X_Decoder = self.LayerNorm1(X_Decoder + self_att)
        cross_att = self.AttentionCross(X_Encoder, self.Dropout2(X_Decoder), Mask_Cross)
        X_Decoder = self.LayerNorm2(X_Decoder + cross_att)
        X_Decoder = self.LayerNorm3(X_Decoder + self.FeedForward(self.Dropout3(X_Decoder)))
        return X_Decoder


class Model(nn.Module):
    """Encoder-decoder transformer with padding-aware attention masks.

    Padding tokens are excluded from the loss via ``ignore_index`` and from
    attention via masks derived from the configured padding-token ids.
    """

    def __init__(self, config):
        super().__init__()
        self.Dim_Attention = config[varables.DIM_ATTENTION]
        self.Token_Padding_Encoder = config["Token_Padding_Encoder"]
        self.Token_Padding_Decoder = config["Token_Padding_Decoder"]
        # separate embedding tables for the two vocabularies
        self.Embedding_Encoder = nn.Embedding(len(config["vocab_encoder"]), self.Dim_Attention)
        self.Embedding_Decoder = nn.Embedding(len(config["vocab_decoder"]), self.Dim_Attention)
        self.pos_emb = PositionalEncoder(config)
        self.Dropout1 = nn.Dropout(config[varables.RATE_DROPOUT])
        self.Dropout2 = nn.Dropout(config[varables.RATE_DROPOUT])
        # NOTE(review): LayerNorm1/2 are created but their uses in forward are
        # commented out — kept so existing checkpoints still load.
        self.LayerNorm1 = nn.LayerNorm(config[varables.DIM_EMBEDDING])
        self.LayerNorm2 = nn.LayerNorm(config[varables.DIM_EMBEDDING])
        self.encoder_blocks = nn.ModuleList(
            EncoderBlock(config) for _ in range(config[varables.NUM_LAYERS])
        )
        self.decoder_blocks = nn.ModuleList(
            DecoderBlock(config) for _ in range(config[varables.NUM_LAYERS])
        )
        self.head = nn.Linear(self.Dim_Attention, len(config["vocab_decoder"]), bias=False)
        self.apply(self._init_weights)
        self.optimizer = None  # assigned externally after init_optimizers()

    def _init_weights(self, module):
        # Xavier-init every weight matrix.  NOTE(review): nn.Module.apply visits
        # every submodule and parameters() recurses, so each parameter is
        # re-initialised several times; the final values are still xavier.
        for p in module.parameters():
            if p.dim() > 1:
                nn.init.xavier_uniform_(p)

    def init_optimizers(self, train_config):
        """Create an Adam optimizer over all parameters (not stored on self)."""
        return torch.optim.Adam(self.parameters(), lr=train_config[varables.RATE_LEARNING])

    def init_scheduler(self, train_config):
        """Create a StepLR scheduler bound to the externally-set self.optimizer."""
        return torch.optim.lr_scheduler.StepLR(
            self.optimizer,
            step_size=train_config[varables.SIZE_STEP],
            gamma=train_config[varables.GAMMA],
        )

    def get_collate_fn(self, vocab_encoder, vocab_decoder):
        """Collate fn that right-pads encoder/decoder token lists into LongTensors."""
        def collate(samples):
            enc = [s[0] for s in samples]
            dec = [s[1] for s in samples]
            w_enc = max(len(s) for s in enc)
            w_dec = max(len(s) for s in dec)
            x = torch.tensor(
                [s + [vocab_encoder[varables.TOKEN_PAD]] * (w_enc - len(s)) for s in enc],
                dtype=torch.long,
            )
            y = torch.tensor(
                [s + [vocab_decoder[varables.TOKEN_PAD]] * (w_dec - len(s)) for s in dec],
                dtype=torch.long,
            )
            return x, y, -1  # boundary = -1: unused by this model
        return collate

    def generate_masks(self, X_Encoder, X_Decoder):
        """Build (encoder, decoder, cross) attention masks from padding ids.

        The decoder mask combines key-padding with a lower-triangular causal
        mask; encoder and cross masks hide padded encoder positions only.
        """
        T = X_Decoder.shape[1]
        Mask_Encoder = (X_Encoder != self.Token_Padding_Encoder).unsqueeze(-2).unsqueeze(-2)
        Mask_Cross = (X_Encoder != self.Token_Padding_Encoder).unsqueeze(-2).unsqueeze(-2)
        Mask_Decoder = (
            (X_Decoder != self.Token_Padding_Decoder)
            .unsqueeze(-2)
            .unsqueeze(-2)
            .repeat(1, 1, T, 1)
        )
        causal = torch.tril(torch.ones(T, T)).view(1, 1, T, T).to(Mask_Decoder.device)
        Mask_Decoder = Mask_Decoder.masked_fill(causal == 0, 0)
        return Mask_Encoder, Mask_Decoder, Mask_Cross

    def forward(self, X_Encoder, X_Decoder, Y_Decoder_Ref=None, boundary=None):
        """Return (logits, loss); loss is None unless reference targets are given."""
        Mask_Encoder, Mask_Decoder, Mask_Cross = self.generate_masks(X_Encoder, X_Decoder)
        scale = math.sqrt(self.Dim_Attention)
        X_Encoder = self.Dropout1(
            self.Embedding_Encoder(X_Encoder) * scale + self.pos_emb(X_Encoder.size(1))
        )
        X_Decoder = self.Dropout2(
            self.Embedding_Decoder(X_Decoder) * scale + self.pos_emb(X_Decoder.size(1))
        )
        for block in self.encoder_blocks:
            X_Encoder = block(X_Encoder, Mask_Encoder)
        for block in self.decoder_blocks:
            X_Decoder = block(X_Encoder, X_Decoder, Mask_Cross, Mask_Decoder)
        Y_Decoder_Logits = self.head(X_Decoder)
        loss = None
        if Y_Decoder_Ref is not None:
            loss = F.cross_entropy(
                Y_Decoder_Logits.view(-1, Y_Decoder_Logits.size(-1)),
                Y_Decoder_Ref.view(-1),
                ignore_index=self.Token_Padding_Decoder,
            )
        return Y_Decoder_Logits, loss
def set_seed(seed):
    """Seed python, numpy and torch (all devices) for reproducible sampling."""
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)  # no-op on CPU-only builds


def top_k_logits(logits, k):
    """Keep the top-k values in each row of ``logits``; set the rest to -inf.

    Returns a new tensor; ``logits`` itself is not modified.
    """
    v, _ = torch.topk(logits, k)
    out = logits.clone()
    # v[:, [-1]] is the k-th largest value per row, kept as a column so it
    # broadcasts across the row.
    out[out < v[:, [-1]]] = -float("Inf")
    return out


@torch.no_grad()
def sample(model, x, steps, temperature=1.0, boundary=None):
    """Autoregressively extend ``x`` by ``steps`` tokens via multinomial sampling.

    Fix: the original module defined two functions both named ``sample``; the
    first (with top-k support) was dead code because this definition shadowed
    it at import time, so only this effective variant is kept.  Use
    ``top_k_logits`` explicitly if top-k filtering is needed again.
    """
    block_size = model.get_block_size()
    model.eval()
    for _ in range(steps):
        # crop the context to the model's block size
        x_cond = x if x.size(1) <= block_size else x[:, -block_size:]
        logits, _ = model(x_cond, boundary=boundary)
        logits = logits[:, -1, :] / temperature
        probs = F.softmax(logits, dim=-1)
        ix = torch.multinomial(probs, num_samples=1)
        x = torch.cat((x, ix), dim=1)
    return x


def sample_L(i, option='string'):
    """Debug helper: sample 32 completions for linker prefix ``L_<i>``.

    NOTE(review): relies on module-level globals (``vocab``, ``inv``,
    ``model``, ``test_valid``) that are not defined in this file, and
    hard-codes device='cuda' — this only runs inside the interactive session
    it was pasted from; move it to a script with explicit arguments.
    """
    prefix = 'L_' + str(i)
    string_input = prefix + '*O=C1NN=Cc2c1cccc2.*O=C(C1CC1)N1CCNCC1'
    array_input = [vocab[a] for a in [''] + list(string_input)]
    boundary = [len(array_input)] * 32
    tensor_input = torch.tensor(array_input, device='cuda').unsqueeze(0).repeat(32, 1)
    tensor_output = sample(model, tensor_input, 250, boundary=boundary)
    strings_output = []
    for j in range(tensor_output.shape[0]):
        # drop pad-like tokens after the prompt boundary
        # NOTE(review): token names here appear as empty strings in the source
        # (angle-bracket tokens likely lost in transit) — verify against varables.
        list_string_output = [
            inv[a]
            for a in tensor_output[j, boundary[j]:].cpu().numpy()
            if a != vocab['']
        ]
        if list_string_output[-1] == '':
            list_string_output = list_string_output[:-1]
        string_output = ''.join(list_string_output)
        strings_output.append(string_output)
        print(string_output)
    for j in range(tensor_output.shape[0]):
        if test_valid(strings_output[j]):
            print(1)
        else:
            print(0)
class CausalSelfAttention(nn.Module):
    """Multi-head self-attention with a registered causal (tril) mask buffer."""

    def __init__(self, config):
        super().__init__()
        assert config[varables.DIM_ATTENTION] % config[varables.NUM_HEADS] == 0
        d_in = config[varables.DIM_EMBEDDING]
        d_att = config[varables.DIM_ATTENTION]
        block = config[varables.SIZE_BLOCK]
        self.key = nn.Linear(d_in, d_att)
        self.query = nn.Linear(d_in, d_att)
        self.value = nn.Linear(d_in, d_att)
        self.dropout_attention = nn.Dropout(config[varables.RATE_DROPOUT])
        self.dropout_residue = nn.Dropout(config[varables.RATE_DROPOUT])
        self.projection = nn.Linear(d_att, d_in)
        self.register_buffer(
            "mask", torch.tril(torch.ones(block, block)).view(1, 1, block, block)
        )
        self.n_head = config[varables.NUM_HEADS]
        self.single_head_dim = d_att // self.n_head
        self.attention_features = d_att

    def _split(self, tensor, batch, steps):
        # (B, T, d_att) -> (B, heads, T, head_dim)
        return tensor.view(batch, steps, self.n_head, self.single_head_dim).transpose(1, 2)

    def forward(self, x, layer_past=None):
        batch, steps, _ = x.size()
        k = self._split(self.key(x), batch, steps)
        q = self._split(self.query(x), batch, steps)
        v = self._split(self.value(x), batch, steps)
        scores = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1)))
        scores = scores.masked_fill(self.mask[:, :, :steps, :steps] == 0, float("-inf"))
        weights = self.dropout_attention(F.softmax(scores, dim=-1))
        mixed = (weights @ v).transpose(1, 2).contiguous().view(batch, steps, self.attention_features)
        return self.dropout_residue(self.projection(mixed))


class CrossAttention(nn.Module):
    """Decoder-queries-over-encoder-keys/values attention."""

    def __init__(self, config):
        super().__init__()
        assert config[varables.DIM_ATTENTION] % config[varables.NUM_HEADS] == 0
        d_in = config[varables.DIM_EMBEDDING]
        d_att = config[varables.DIM_ATTENTION]
        block = config[varables.SIZE_BLOCK]
        self.key = nn.Linear(d_in, d_att)
        self.query = nn.Linear(d_in, d_att)
        self.value = nn.Linear(d_in, d_att)
        self.dropout_attention = nn.Dropout(config[varables.RATE_DROPOUT])
        self.dropout_residue = nn.Dropout(config[varables.RATE_DROPOUT])
        self.projection = nn.Linear(d_att, d_in)
        self.n_head = config[varables.NUM_HEADS]
        self.single_head_dim = d_att // self.n_head
        self.attention_features = d_att
        self.register_buffer(
            "mask", torch.tril(torch.ones(block, block)).view(1, 1, block, block)
        )

    def forward(self, x_encoder, x_decoder, layer_past=None):
        batch, t_enc, _ = x_encoder.size()
        _, t_dec, _ = x_decoder.size()
        k = self.key(x_encoder).view(batch, t_enc, self.n_head, self.single_head_dim).transpose(1, 2)
        q = self.query(x_decoder).view(batch, t_dec, self.n_head, self.single_head_dim).transpose(1, 2)
        v = self.value(x_encoder).view(batch, t_enc, self.n_head, self.single_head_dim).transpose(1, 2)
        scores = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1)))
        # NOTE(review): causal tril mask on cross-attention restricts decoder
        # position t to encoder positions <= t — unusual; the newer model.py
        # uses a padding-based cross mask instead.  Confirm intended.
        scores = scores.masked_fill(self.mask[:, :, :t_dec, :t_enc] == 0, float("-inf"))
        weights = self.dropout_attention(F.softmax(scores, dim=-1))
        mixed = (weights @ v).transpose(1, 2).contiguous().view(batch, t_dec, self.attention_features)
        return self.dropout_residue(self.projection(mixed))
class Norm(nn.Module):
    """Layer normalisation with learnable scale (alpha) and shift (bias)."""

    def __init__(self, d_model, eps=1e-6):
        super().__init__()
        self.size = d_model
        # learnable parameters calibrating the normalisation
        self.alpha = nn.Parameter(torch.ones(self.size))
        self.bias = nn.Parameter(torch.zeros(self.size))
        self.eps = eps

    def forward(self, x):
        centered = x - x.mean(dim=-1, keepdim=True)
        scaled = centered / (x.std(dim=-1, keepdim=True) + self.eps)
        return self.alpha * scaled + self.bias


def attention(q, k, v, d_k, mask=None, dropout=None):
    """Scaled dot-product attention; ``mask == 0`` positions get -1e9."""
    scores = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(d_k)
    if mask is not None:
        mask = mask.unsqueeze(1)  # broadcast over the heads dimension
        scores = scores.masked_fill(mask == 0, -1e9)
    scores = F.softmax(scores, dim=-1)
    if dropout is not None:
        scores = dropout(scores)
    return torch.matmul(scores, v)


class MultiHeadAttention(nn.Module):
    """Multi-head attention with separate q/k/v projections and an output linear."""

    def __init__(self, heads, d_model, dropout=0.1):
        super().__init__()
        self.d_model = d_model
        self.d_k = d_model // heads
        self.h = heads
        self.q_linear = nn.Linear(d_model, d_model)
        self.v_linear = nn.Linear(d_model, d_model)
        self.k_linear = nn.Linear(d_model, d_model)
        self.dropout = nn.Dropout(dropout)
        self.out = nn.Linear(d_model, d_model)

    def forward(self, q, k, v, mask=None):
        bs = q.size(0)
        # project, split into heads, then move heads before sequence:
        # (bs, seq, d_model) -> (bs, h, seq, d_k)
        k = self.k_linear(k).view(bs, -1, self.h, self.d_k).transpose(1, 2)
        q = self.q_linear(q).view(bs, -1, self.h, self.d_k).transpose(1, 2)
        v = self.v_linear(v).view(bs, -1, self.h, self.d_k).transpose(1, 2)
        scores = attention(q, k, v, self.d_k, mask, self.dropout)
        # merge the heads back and apply the final linear layer
        concat = scores.transpose(1, 2).contiguous().view(bs, -1, self.d_model)
        return self.out(concat)


class FeedForward(nn.Module):
    """Position-wise feed-forward network (ReLU, default hidden size 2048)."""

    def __init__(self, d_model, d_ff=2048, dropout=0.1):
        super().__init__()
        self.linear_1 = nn.Linear(d_model, d_ff)
        self.dropout = nn.Dropout(dropout)
        self.linear_2 = nn.Linear(d_ff, d_model)

    def forward(self, x):
        hidden = self.dropout(F.relu(self.linear_1(x)))
        return self.linear_2(hidden)


class EncoderLayer(nn.Module):
    """Pre-norm encoder layer: self-attention then feed-forward, with residuals."""

    def __init__(self, d_model, heads, dropout=0.1):
        super().__init__()
        self.norm_1 = Norm(d_model)
        self.norm_2 = Norm(d_model)
        self.attn = MultiHeadAttention(heads, d_model, dropout=dropout)
        self.ff = FeedForward(d_model, dropout=dropout)
        self.dropout_1 = nn.Dropout(dropout)
        self.dropout_2 = nn.Dropout(dropout)

    def forward(self, x, mask):
        normed = self.norm_1(x)
        x = x + self.dropout_1(self.attn(normed, normed, normed, mask))
        normed = self.norm_2(x)
        return x + self.dropout_2(self.ff(normed))


class DecoderLayer(nn.Module):
    """Pre-norm decoder layer: masked self-attention, cross-attention, feed-forward."""

    def __init__(self, d_model, heads, dropout=0.1):
        super().__init__()
        self.norm_1 = Norm(d_model)
        self.norm_2 = Norm(d_model)
        self.norm_3 = Norm(d_model)
        self.dropout_1 = nn.Dropout(dropout)
        self.dropout_2 = nn.Dropout(dropout)
        self.dropout_3 = nn.Dropout(dropout)
        self.attn_1 = MultiHeadAttention(heads, d_model, dropout=dropout)
        self.attn_2 = MultiHeadAttention(heads, d_model, dropout=dropout)
        self.ff = FeedForward(d_model, dropout=dropout)

    def forward(self, x, e_outputs, src_mask, trg_mask):
        normed = self.norm_1(x)
        x = x + self.dropout_1(self.attn_1(normed, normed, normed, trg_mask))
        normed = self.norm_2(x)
        x = x + self.dropout_2(self.attn_2(normed, e_outputs, e_outputs, src_mask))
        normed = self.norm_3(x)
        return x + self.dropout_3(self.ff(normed))
nn.Embedding(vocab_size, d_model) + def forward(self, x): + return self.embed(x) + +class PositionalEncoder(nn.Module): + def __init__(self, d_model, max_seq_len = 200, dropout = 0.1): + super().__init__() + self.d_model = d_model + self.dropout = nn.Dropout(dropout) + # create constant 'pe' matrix with values dependant on + # pos and i + pe = torch.zeros(max_seq_len, d_model) + for pos in range(max_seq_len): + for i in range(0, d_model, 2): + pe[pos, i] = \ + math.sin(pos / (10000 ** ((2 * i)/d_model))) + pe[pos, i + 1] = \ + math.cos(pos / (10000 ** ((2 * (i + 1))/d_model))) + pe = pe.unsqueeze(0) + self.register_buffer('pe', pe) + + + def forward(self, x): + # make embeddings relatively larger + x = x * math.sqrt(self.d_model) + #add constant to embedding + seq_len = x.size(1) + pe = Variable(self.pe[:,:seq_len], requires_grad=False) + if x.is_cuda: + pe.cuda() + x = x + pe + return self.dropout(x) + +def get_clones(module, N): + return nn.ModuleList([copy.deepcopy(module) for i in range(N)]) + +class Encoder(nn.Module): + def __init__(self, vocab_size, d_model, N, heads, dropout): + super().__init__() + self.N = N + self.embed = Embedder(vocab_size, d_model) + self.pe = PositionalEncoder(d_model, dropout=dropout) + self.layers = get_clones(EncoderLayer(d_model, heads, dropout), N) + self.norm = Norm(d_model) + def forward(self, src, mask): + x = self.embed(src) + x = self.pe(x) + for i in range(self.N): + x = self.layers[i](x, mask) + return self.norm(x) + +class Decoder(nn.Module): + def __init__(self, vocab_size, d_model, N, heads, dropout): + super().__init__() + self.N = N + self.embed = Embedder(vocab_size, d_model) + self.pe = PositionalEncoder(d_model, dropout=dropout) + self.layers = get_clones(DecoderLayer(d_model, heads, dropout), N) + self.norm = Norm(d_model) + def forward(self, trg, e_outputs, src_mask, trg_mask): + x = self.embed(trg) + x = self.pe(x) + for i in range(self.N): + x = self.layers[i](x, e_outputs, src_mask, trg_mask) + return 
self.norm(x) + +class Model(nn.Module): + def __init__(self, config): + super().__init__() + self.encoder = Encoder(len(config["vocab_encoder"]), config[varables.DIM_ATTENTION], config[varables.NUM_LAYERS], config[varables.NUM_HEADS], config[varables.RATE_DROPOUT]) + self.decoder = Decoder(len(config["vocab_decoder"]), config[varables.DIM_ATTENTION], config[varables.NUM_LAYERS], config[varables.NUM_HEADS], config[varables.RATE_DROPOUT]) + self.out = nn.Linear(config[varables.DIM_ATTENTION], len(config["vocab_decoder"])) + # self.tok_emb = nn.Embedding(config[varables.SIZE_VOCAB], config[varables.DIM_EMBEDDING]) + # self.pos_emb = nn.Parameter(torch.zeros(1, config[varables.SIZE_BLOCK], config[varables.DIM_EMBEDDING])) + # self.drop = nn.Dropout(config[varables.RATE_DROPOUT]) + # self.encoder_blocks = nn.ModuleList([EncoderBlock(config) for _ in range(config[varables.NUM_LAYERS])]) + # self.decoder_blocks = nn.ModuleList([DecoderBlock(config) for _ in range(config[varables.NUM_LAYERS])]) + # self.blocks = nn.Sequential(*[DecoderBlock(config) for _ in range(config[varables.NUM_LAYERS])]) + # self.ln_f = nn.LayerNorm(config[varables.DIM_EMBEDDING]) + # self.head = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.SIZE_VOCAB], bias=False) + # self.block_size = config[varables.SIZE_BLOCK] + # self.apply(self._init_weights) + # logger.info("number of parameters: %e", sum(p.numel() for p in self.parameters())) + self.optimizer = None + + def get_block_size(self): + return self.block_size + + def _init_weights(self, module): + if isinstance(module, (nn.Linear, nn.Embedding)): + module.weight.data.normal_(mean=0.0, std=0.02) + if isinstance(module, nn.Linear) and module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.LayerNorm): + module.bias.data.zero_() + module.weight.data.fill_(1.0) + def init_optimizers(self,train_config): + optimizer = torch.optim.Adam(self.parameters(), lr=train_config[varables.RATE_LEARNING]) + return optimizer + 
    def init_scheduler(self,train_config):
        """Step-decay LR scheduler.

        NOTE(review): reads self.optimizer, which __init__ leaves as None;
        the caller must assign model.optimizer before calling this.
        """
        scheduler = torch.optim.lr_scheduler.StepLR(self.optimizer, step_size=train_config[varables.SIZE_STEP], gamma=train_config[varables.GAMMA])
        return scheduler
    def get_collate_fn(self, vocab_encoder,vocab_decoder):
        """Return a DataLoader collate_fn that right-pads encoder and decoder
        token-id lists to the per-batch maximum length with TOKEN_PAD."""
        def collate(results):
            x_in = [a[0] for a in results]
            y_in = [a[1] for a in results]
            boundary = -1  # placeholder kept for the uniform (x, y, boundary) interface
            max_len_x = max([len(a) for a in x_in])
            max_len_y = max([len(a) for a in y_in])
            x = torch.tensor([(a+[vocab_encoder[varables.TOKEN_PAD]]*(max_len_x-len(a))) for a in x_in],dtype=torch.long)
            y = torch.tensor([(a+[vocab_decoder[varables.TOKEN_PAD]]*(max_len_y-len(a))) for a in y_in],dtype=torch.long)
            return x,y,boundary
        return collate
    def forward(self, src, trg, trg_out, boundary=None):
        """Teacher-forced forward pass; returns (logits, loss).

        loss is None when trg_out is None (inference mode).
        """
        src_mask = None  # no source padding mask is applied here
        # Causal (lower-triangular) mask over target positions.
        # NOTE(review): attention() unsqueezes its mask once more, so this
        # already-4-D mask broadcasts the score tensor to 5-D — verify the
        # batch>1 behaviour is as intended.
        trg_mask = torch.tril(torch.ones(trg.shape[1], trg.shape[1])).view(1, 1, trg.shape[1], trg.shape[1]).to(trg.device)
        e_outputs = self.encoder(src, src_mask)
        d_output = self.decoder(trg, e_outputs, src_mask, trg_mask)
        logits = self.out(d_output)
        loss = None
        if trg_out is not None:
            loss = F.cross_entropy(logits.view(-1, logits.size(-1)), trg_out.view(-1))
        return logits, loss

# mark test

# ---------------------------------------------------------------------------
# SCMG/models/Transformer_debug4/model copy.py — new file in this diff.
# NOTE(review): this appears to be a superseded backup of model.py; its code
# is kept byte-identical below, with review comments only.
# ---------------------------------------------------------------------------

import math
import logging

import torch
import torch.nn as nn
from torch.nn import functional as F

logger = logging.getLogger(__name__)
from SCMG.config import varables

# class ModelConfig():
#     rate_dropout_embedding = 0.1
#     rate_dropout_residue = 0.1
#     rate_dropout_attention = 0.1
#     block_size=125
#     def __init__(self, size_vocab, **kwargs):
#         self.size_vocab = size_vocab
#         for k,v in kwargs.items():
#             setattr(self, k, v)

class CausalSelfAttention(nn.Module):
    """Multi-head self-attention with a fixed lower-triangular (causal) mask
    buffer of size SIZE_BLOCK x SIZE_BLOCK."""
    def __init__(self, config):
        super().__init__()
        assert config[varables.DIM_ATTENTION] % config[varables.NUM_HEADS] == 0
        # Fused per-head projections.
        self.key = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION])
        self.query = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION])
        self.value = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION])
        self.dropout_attention = nn.Dropout(config[varables.RATE_DROPOUT])
        self.dropout_residue = nn.Dropout(config[varables.RATE_DROPOUT])
        self.projection = nn.Linear(config[varables.DIM_ATTENTION], config[varables.DIM_EMBEDDING])
        # Constant causal mask, registered so it follows the module's device.
        self.register_buffer("mask", torch.tril(torch.ones(config[varables.SIZE_BLOCK], config[varables.SIZE_BLOCK]))
                             .view(1, 1, config[varables.SIZE_BLOCK], config[varables.SIZE_BLOCK]))
        self.n_head = config[varables.NUM_HEADS]
        self.single_head_dim = config[varables.DIM_ATTENTION] // self.n_head
        self.attention_features = config[varables.DIM_ATTENTION]

    def forward(self, x, layer_past=None):
        # x: (B, T, C) with C == DIM_EMBEDDING.
        B, T, C = x.size()
        k = self.key(x).view(B, T, self.n_head,self.single_head_dim).transpose(1, 2)
        q = self.query(x).view(B, T, self.n_head,self.single_head_dim).transpose(1, 2)
        v = self.value(x).view(B, T, self.n_head,self.single_head_dim).transpose(1, 2)
        att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1)))
        # Hide future positions before the softmax.
        att = att.masked_fill(self.mask[:,:,:T,:T] == 0, float('-inf'))
        att = F.softmax(att, dim=-1)
        att = self.dropout_attention(att)
        y = att @ v
        y = y.transpose(1, 2).contiguous().view(B, T, self.attention_features)
        y = self.dropout_residue(self.projection(y))
        return y


class CrossAttention(nn.Module):
    """Attention with decoder queries over encoder keys/values.

    NOTE(review): a causal (lower-triangular) mask is applied across ENCODER
    positions, i.e. decoder step t cannot attend encoder positions > t.
    Standard cross-attention is unmasked — confirm this is intentional.
    """
    def __init__(self, config):
        super().__init__()
        assert config[varables.DIM_ATTENTION] % config[varables.NUM_HEADS] == 0
        self.key = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION])
        self.query = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION])
        self.value = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION])
        self.dropout_attention = nn.Dropout(config[varables.RATE_DROPOUT])
        self.dropout_residue = nn.Dropout(config[varables.RATE_DROPOUT])
        self.projection = nn.Linear(config[varables.DIM_ATTENTION], config[varables.DIM_EMBEDDING])
        self.n_head = config[varables.NUM_HEADS]
        self.single_head_dim = config[varables.DIM_ATTENTION] // self.n_head
        self.attention_features = config[varables.DIM_ATTENTION]
        self.register_buffer("mask", torch.tril(torch.ones(config[varables.SIZE_BLOCK], config[varables.SIZE_BLOCK]))
                             .view(1, 1, config[varables.SIZE_BLOCK], config[varables.SIZE_BLOCK]))

    def forward(self, x_encoder,x_decoder, layer_past=None):
        B_encoder, T_encoder, C_encoder = x_encoder.size()
        B_decoder, T_decoder, C_decoder = x_decoder.size()
        # Keys/values from the encoder stream, queries from the decoder stream.
        k = self.key( x_encoder).view(B_encoder, T_encoder, self.n_head,self.single_head_dim).transpose(1, 2)
        q = self.query(x_decoder).view(B_encoder, T_decoder, self.n_head,self.single_head_dim).transpose(1, 2)
        v = self.value(x_encoder).view(B_encoder, T_encoder, self.n_head,self.single_head_dim).transpose(1, 2)
        att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1)))
        att = att.masked_fill(self.mask[:,:,:T_decoder,:T_encoder] == 0, float('-inf'))
        att = F.softmax(att, dim=-1)
        att = self.dropout_attention(att)
        y = att @ v
        y = y.transpose(1, 2).contiguous().view(B_encoder, T_decoder, self.attention_features)
        y = self.dropout_residue(self.projection(y))
        return y


class EncoderBlock(nn.Module):
    """Pre-norm encoder block: (causal!) self-attention + GELU MLP.

    NOTE(review): the encoder uses CausalSelfAttention, so encoder tokens
    cannot see later source tokens — confirm this is intentional.
    """
    def __init__(self, config):
        super().__init__()
        self.ln1 = nn.LayerNorm(config[varables.DIM_EMBEDDING])
        self.ln2 = nn.LayerNorm(config[varables.DIM_EMBEDDING])
        self.attn = CausalSelfAttention(config)
        self.mlp = nn.Sequential(
            nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_FEEDFORWARD]),
            nn.GELU(),
            nn.Linear(config[varables.DIM_FEEDFORWARD], config[varables.DIM_EMBEDDING]),
            nn.Dropout(config[varables.RATE_DROPOUT]),
        )

    def forward(self, x):
        x = x + self.attn(self.ln1(x))
        x = x + self.mlp(self.ln2(x))
        return x

class DecoderBlock(nn.Module):
    """Pre-norm decoder block: masked self-attention, cross-attention, MLP.

    NOTE(review): ln1's output feeds BOTH the masked self-attention and
    (recomputed) the cross-attention input; there is no dedicated third
    LayerNorm — confirm this is intentional.
    """
    def __init__(self, config):
        super().__init__()
        self.ln1 = nn.LayerNorm(config[varables.DIM_EMBEDDING])
        self.ln2 = nn.LayerNorm(config[varables.DIM_EMBEDDING])
        self.masked_attn = CausalSelfAttention(config)
        self.cross_attn = CrossAttention(config)
        self.mlp = nn.Sequential(
            nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_FEEDFORWARD]),
            nn.GELU(),
            nn.Linear(config[varables.DIM_FEEDFORWARD], config[varables.DIM_EMBEDDING]),
            nn.Dropout(config[varables.RATE_DROPOUT]),
        )

    def forward(self, x_encoder,x):
        x = x + self.masked_attn(self.ln1(x))
        x = x + self.cross_attn(x_encoder,self.ln1(x))
        x = x + self.mlp(self.ln2(x))
        return x

class Model(nn.Module):
    """GPT-style encoder-decoder assembled from the pre-norm blocks above.

    Uses one shared token embedding and one learned positional embedding
    table for both streams.
    """
    def __init__(self, config):
        super().__init__()
        self.tok_emb = nn.Embedding(config[varables.SIZE_VOCAB], config[varables.DIM_EMBEDDING])
        self.pos_emb = nn.Parameter(torch.zeros(1, config[varables.SIZE_BLOCK], config[varables.DIM_EMBEDDING]))
        self.drop = nn.Dropout(config[varables.RATE_DROPOUT])
        self.encoder_blocks = nn.ModuleList([EncoderBlock(config) for _ in range(config[varables.NUM_LAYERS])])
        self.decoder_blocks = nn.ModuleList([DecoderBlock(config) for _ in range(config[varables.NUM_LAYERS])])
        # self.blocks = nn.Sequential(*[DecoderBlock(config) for _ in range(config[varables.NUM_LAYERS])])
        self.ln_f = nn.LayerNorm(config[varables.DIM_EMBEDDING])
        self.head = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.SIZE_VOCAB], bias=False)
        self.block_size = config[varables.SIZE_BLOCK]
        self.apply(self._init_weights)
        logger.info("number of parameters: %e", sum(p.numel() for p in self.parameters()))
        # Assigned externally by the training driver before init_scheduler.
        self.optimizer = None

    def get_block_size(self):
        # Maximum sequence length supported by pos_emb.
        return self.block_size

    def _init_weights(self, module):
        # GPT-style init: N(0, 0.02) weights, zero biases, identity LayerNorm.
        if isinstance(module, (nn.Linear, nn.Embedding)):
            module.weight.data.normal_(mean=0.0, std=0.02)
            if isinstance(module, nn.Linear) and module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
    def init_optimizers(self,train_config):
        """Create an Adam optimizer over all parameters (not stored on self)."""
        optimizer = torch.optim.Adam(self.parameters(), lr=train_config[varables.RATE_LEARNING])
        return optimizer
    def init_scheduler(self,train_config):
        """Step-decay LR scheduler; requires self.optimizer to be set first."""
        scheduler = torch.optim.lr_scheduler.StepLR(self.optimizer, step_size=train_config[varables.SIZE_STEP], gamma=train_config[varables.GAMMA])
        return scheduler
    def get_collate_fn(self, vocab):
        """Collate_fn that right-pads both sequences with TOKEN_PAD (single
        shared vocab here, unlike the two-vocab variant in model.py)."""
        def collate(results):
            x_in = [a[0] for a in results]
            y_in = [a[1] for a in results]
            boundary = -1
            max_len_x = max([len(a) for a in x_in])
            max_len_y = max([len(a) for a in y_in])
            x = torch.tensor([(a+[vocab[varables.TOKEN_PAD]]*(max_len_x-len(a))) for a in x_in],dtype=torch.long)
            y = torch.tensor([(a+[vocab[varables.TOKEN_PAD]]*(max_len_y-len(a))) for a in y_in],dtype=torch.long)
            return x,y,boundary
        return collate

    def forward(self, x_in, y_in, y_out=None,boundary=None):
        """Teacher-forced forward pass; returns (logits, loss)."""
        # Token + learned positional embeddings for both streams.
        x_in = self.drop(self.tok_emb(x_in) + self.pos_emb[:, :x_in.size()[1], :])
        y_in = self.drop(self.tok_emb(y_in) + self.pos_emb[:, :y_in.size()[1], :])
        for encoder_block in self.encoder_blocks:
            x_in = encoder_block(x_in)
        x_in = self.ln_f(x_in)  # NOTE(review): ln_f is shared by both streams
        for decoder_block in self.decoder_blocks:
            y_in = decoder_block(x_in,y_in)
        y_in = self.ln_f(y_in)
        logits = self.head(y_in)
        loss = None
        if y_out is not None:
            loss = F.cross_entropy(logits.view(-1, logits.size(-1)), y_out.view(-1))
        return logits, loss

# mark test
import math
import logging

import torch
import torch.nn as nn
from torch.nn import functional as F

# logger = logging.getLogger(__name__)
from SCMG.config import varables
from torch.autograd import Variable  # NOTE(review): unused; kept as a file-level import


class PositionalEncoder(nn.Module):
    """Sinusoidal positional encoding precomputed up to SIZE_BLOCK positions."""

    def __init__(self, config):
        super(PositionalEncoder, self).__init__()
        self.Dropout = nn.Dropout(p=config[varables.RATE_DROPOUT])
        max_len = config[varables.SIZE_BLOCK]
        pe = torch.zeros(max_len, config[varables.DIM_EMBEDDING])
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        # Geometric frequency progression: 10000^(-2i / d_model).
        div_term = torch.exp(torch.arange(0, config[varables.DIM_EMBEDDING], 2).float() * (-math.log(10000.0) / config[varables.DIM_EMBEDDING]))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0)
        # Buffer (not a parameter): moves with .to(device), no gradients.
        self.register_buffer('pe', pe)

    def forward(self, T):
        """Return dropout(pe[:, :T]); note dropout hits the table itself."""
        x = self.Dropout(self.pe[:, :T, :])
        return x


class Attention(nn.Module):
    """Unified multi-head attention used for both self- and cross-attention.

    forward(X_1, X_2, mask): keys/values come from X_1, queries from X_2;
    pass X_2=None for self-attention (queries also drawn from X_1).
    """

    def __init__(self, config):
        super().__init__()
        assert config[varables.DIM_ATTENTION] % config[varables.NUM_HEADS] == 0
        self.Key = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION])
        self.Query = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION])
        self.Value = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION])
        self.Dropout_Attention = nn.Dropout(config[varables.RATE_DROPOUT])
        self.Dropout_Residue = nn.Dropout(config[varables.RATE_DROPOUT])
        self.Projection = nn.Linear(config[varables.DIM_ATTENTION], config[varables.DIM_EMBEDDING])
        self.NumberOfHeads = config[varables.NUM_HEADS]
        self.DimHead = config[varables.DIM_ATTENTION] // self.NumberOfHeads
        self.DimAttention = config[varables.DIM_ATTENTION]

    def forward(self, X_1, X_2, mask=None):
        if X_2 is None:
            X_2 = X_1
        BatchSize, T_Encoder, _ = X_1.size()
        BatchSize, T_Decoder, _ = X_2.size()
        # Split heads: (batch, heads, seq, head_dim).
        K = self.Key(X_1).view(BatchSize, T_Encoder, self.NumberOfHeads, self.DimHead).transpose(1, 2)
        Q = self.Query(X_2).view(BatchSize, T_Decoder, self.NumberOfHeads, self.DimHead).transpose(1, 2)
        V = self.Value(X_1).view(BatchSize, T_Encoder, self.NumberOfHeads, self.DimHead).transpose(1, 2)
        ScoreAttention = (Q @ K.transpose(-2, -1)) / math.sqrt(self.DimHead)
        # BUG FIX: the declared default mask=None previously crashed on
        # `mask == 0` (TypeError); only mask when a mask is supplied.
        if mask is not None:
            ScoreAttention = ScoreAttention.masked_fill(mask == 0, -1e9)
        ScoreAttention = F.softmax(ScoreAttention, dim=-1)
        ScoreAttention = self.Dropout_Attention(ScoreAttention)
        Z = ScoreAttention @ V
        # Merge heads back: (batch, seq, DimAttention).
        Z = Z.transpose(1, 2).contiguous().view(BatchSize, T_Decoder, self.DimAttention)
        Z = self.Dropout_Residue(self.Projection(Z))
        return Z


class FeedForward(nn.Module):
    """Position-wise MLP; DIM_FEEDFORWARD == 0 means "use 4 x DIM_ATTENTION"."""

    def __init__(self, config):
        super().__init__()
        if config[varables.DIM_FEEDFORWARD] == 0:
            Dim_FeedForward = config[varables.DIM_ATTENTION] * 4
        else:
            Dim_FeedForward = config[varables.DIM_FEEDFORWARD]
        self.Linear1 = nn.Linear(config[varables.DIM_EMBEDDING], Dim_FeedForward)
        self.GELU = nn.GELU()
        self.Linear2 = nn.Linear(Dim_FeedForward, config[varables.DIM_EMBEDDING])
        self.Dropout = nn.Dropout(config[varables.RATE_DROPOUT])

    def forward(self, x):
        x = self.Linear1(x)
        x = self.GELU(x)
        x = self.Dropout(x)
        x = self.Linear2(x)
        return x


class EncoderBlock(nn.Module):
    """Post-norm encoder block.

    NOTE(review): dropout is applied to the *input* of each sub-layer
    (Dropout1/Dropout2) rather than to its output — unusual; kept as-is.
    """

    def __init__(self, config):
        super().__init__()
        self.LayerNorm1 = nn.LayerNorm(config[varables.DIM_EMBEDDING])
        self.LayerNorm2 = nn.LayerNorm(config[varables.DIM_EMBEDDING])
        self.Dropout1 = nn.Dropout(config[varables.RATE_DROPOUT])
        self.Dropout2 = nn.Dropout(config[varables.RATE_DROPOUT])
        self.Attention = Attention(config)
        self.FeedForward = FeedForward(config)

    def forward(self, X_Encoder, Mask_Encoder):
        X_Encoder = self.LayerNorm1(X_Encoder + self.Attention(self.Dropout1(X_Encoder), None, Mask_Encoder))
        X_Encoder = self.LayerNorm2(X_Encoder + self.FeedForward(self.Dropout2(X_Encoder)))
        return X_Encoder


class DecoderBlock(nn.Module):
    """Post-norm decoder block: masked self-attn, cross-attn, feed-forward."""

    def __init__(self, config):
        super().__init__()
        self.LayerNorm1 = nn.LayerNorm(config[varables.DIM_EMBEDDING])
        self.LayerNorm2 = nn.LayerNorm(config[varables.DIM_EMBEDDING])
        self.LayerNorm3 = nn.LayerNorm(config[varables.DIM_EMBEDDING])
        self.Dropout1 = nn.Dropout(config[varables.RATE_DROPOUT])
        self.Dropout2 = nn.Dropout(config[varables.RATE_DROPOUT])
        self.Dropout3 = nn.Dropout(config[varables.RATE_DROPOUT])
        self.AttentionMasked = Attention(config)
        self.AttentionCross = Attention(config)
        self.FeedForward = FeedForward(config)

    def forward(self, X_Encoder, X_Decoder, Mask_Cross, Mask_Decoder):
        X_Decoder = self.LayerNorm1(X_Decoder + self.AttentionMasked(self.Dropout1(X_Decoder), None, Mask_Decoder))
        X_Decoder = self.LayerNorm2(X_Decoder + self.AttentionCross(X_Encoder, self.Dropout2(X_Decoder), Mask_Cross))
        X_Decoder = self.LayerNorm3(X_Decoder + self.FeedForward(self.Dropout3(X_Decoder)))
        return X_Decoder


class Model(nn.Module):
    """Encoder-decoder transformer with padding-aware masks and Vaswani-style
    sqrt(d)-scaled embeddings; separate encoder and decoder vocabularies."""

    def __init__(self, config):
        super().__init__()
        # Variables
        self.Dim_Embedding = config[varables.DIM_EMBEDDING]
        self.Token_Padding_Encoder = config["Token_Padding_Encoder"]
        self.Token_Padding_Decoder = config["Token_Padding_Decoder"]
        # Embedding and positional encoding layers
        self.Embedding_Encoder = nn.Embedding(len(config["vocab_encoder"]), config[varables.DIM_EMBEDDING])
        self.Embedding_Decoder = nn.Embedding(len(config["vocab_decoder"]), config[varables.DIM_EMBEDDING])
        self.pos_emb = PositionalEncoder(config)
        # Dropout and normalization layers (LayerNorm1/2 are currently unused
        # in forward — the final-norm calls there are commented out).
        self.Dropout1 = nn.Dropout(config[varables.RATE_DROPOUT])
        self.Dropout2 = nn.Dropout(config[varables.RATE_DROPOUT])
        self.LayerNorm1 = nn.LayerNorm(config[varables.DIM_EMBEDDING])
        self.LayerNorm2 = nn.LayerNorm(config[varables.DIM_EMBEDDING])
        # Transformer layers
        self.encoder_blocks = nn.ModuleList([EncoderBlock(config) for _ in range(config[varables.NUM_LAYERS])])
        self.decoder_blocks = nn.ModuleList([DecoderBlock(config) for _ in range(config[varables.NUM_LAYERS])])
        # Output layer
        self.head = nn.Linear(config[varables.DIM_EMBEDDING], len(config["vocab_decoder"]), bias=False)
        # Init
        self.apply(self._init_weights)
        self.optimizer = None
        # logger.info("number of parameters: %e", sum(p.numel() for p in self.parameters()))

    def _init_weights(self, module):
        # Xavier-uniform for weight matrices; 1-D params keep their defaults.
        # NOTE(review): self.apply calls this for every submodule and
        # parameters() is recursive, so nested weights are re-initialised
        # several times; the final result is the same, just wasteful.
        for p in module.parameters():
            if p.dim() > 1:
                nn.init.xavier_uniform_(p)

    def init_optimizers(self, train_config):
        """Create an Adam optimizer over all parameters (not stored on self)."""
        optimizer = torch.optim.Adam(self.parameters(), lr=train_config[varables.RATE_LEARNING])
        return optimizer

    def init_scheduler(self, train_config):
        """Step-decay LR scheduler; requires self.optimizer to be set first."""
        scheduler = torch.optim.lr_scheduler.StepLR(self.optimizer, step_size=train_config[varables.SIZE_STEP], gamma=train_config[varables.GAMMA])
        return scheduler

    def get_collate_fn(self, vocab_encoder, vocab_decoder):
        """Collate_fn that right-pads encoder/decoder id lists with TOKEN_PAD."""
        def collate(results):
            X_Encoder = [a[0] for a in results]
            X_Decoder = [a[1] for a in results]
            boundary = -1  # placeholder kept for the uniform (x, y, boundary) interface
            max_len_x = max([len(a) for a in X_Encoder])
            max_len_y = max([len(a) for a in X_Decoder])
            x = torch.tensor([(a + [vocab_encoder[varables.TOKEN_PAD]] * (max_len_x - len(a))) for a in X_Encoder], dtype=torch.long)
            y = torch.tensor([(a + [vocab_decoder[varables.TOKEN_PAD]] * (max_len_y - len(a))) for a in X_Decoder], dtype=torch.long)
            return x, y, boundary
        return collate

    def generate_masks(self, X_Encoder, X_Decoder):
        """Build encoder padding, causal decoder, and cross padding masks."""
        T = X_Decoder.shape[1]
        Mask_Encoder = (X_Encoder != self.Token_Padding_Encoder).unsqueeze(-2).unsqueeze(-2)
        Mask_Decoder = (X_Decoder != self.Token_Padding_Decoder).unsqueeze(-2).unsqueeze(-2).repeat(1, 1, T, 1)
        Mask_Cross = (X_Encoder != self.Token_Padding_Encoder).unsqueeze(-2).unsqueeze(-2)
        # Combine padding mask with the lower-triangular causal mask.
        mask_tril = torch.tril(torch.ones(T, T)).view(1, 1, T, T).to(Mask_Decoder.device)
        Mask_Decoder = Mask_Decoder.masked_fill(mask_tril == 0, 0)
        return Mask_Encoder, Mask_Decoder, Mask_Cross

    def forward(self, X_Encoder, X_Decoder, Y_Decoder_Ref=None, boundary=None):
        """Teacher-forced forward pass; returns (logits, loss).

        loss (cross-entropy, padding ignored) is None when Y_Decoder_Ref is None.
        """
        Mask_Encoder, Mask_Decoder, Mask_Cross = self.generate_masks(X_Encoder, X_Decoder)
        # Embed + scale by sqrt(d) + add positional encoding.
        X_Encoder = self.Dropout1(self.Embedding_Encoder(X_Encoder) * math.sqrt(self.Dim_Embedding) + self.pos_emb(X_Encoder.size(1)))
        X_Decoder = self.Dropout2(self.Embedding_Decoder(X_Decoder) * math.sqrt(self.Dim_Embedding) + self.pos_emb(X_Decoder.size(1)))
        # Encoder blocks
        for encoder_block in self.encoder_blocks:
            X_Encoder = encoder_block(X_Encoder, Mask_Encoder)
        # X_Encoder = self.LayerNorm1(X_Encoder)
        # Decoder blocks
        for decoder_block in self.decoder_blocks:
            X_Decoder = decoder_block(X_Encoder, X_Decoder, Mask_Cross, Mask_Decoder)
        # X_Decoder = self.LayerNorm2(X_Decoder)
        Y_Decoder_Logits = self.head(X_Decoder)
        loss = None
        if Y_Decoder_Ref is not None:
            loss = F.cross_entropy(Y_Decoder_Logits.view(-1, Y_Decoder_Logits.size(-1)), Y_Decoder_Ref.view(-1), ignore_index=self.Token_Padding_Decoder)
        return Y_Decoder_Logits, loss

    # Legacy commented-out generate_masks variant kept for reference:
    # def generate_masks(self,X_Encoder, X_Decoder):
    #     Mask_Encoder = (X_Encoder != self.Token_Padding_Encoder).unsqueeze(-2).int().cpu()
    #     Mask_Decoder = (X_Decoder != self.Token_Padding_Decoder).unsqueeze(-2).int().cpu()
    #     Mask_Cross = Mask_Decoder.unsqueeze(-1) @ Mask_Encoder.unsqueeze(-2)
    #     Mask_Encoder = Mask_Encoder.unsqueeze(-1) @ Mask_Encoder.unsqueeze(-2)
    #     Mask_Decoder = Mask_Decoder.unsqueeze(-1) @
Mask_Decoder.unsqueeze(-2) + # T = X_Decoder.shape[1] + # mask_tril = torch.tril(torch.ones(T, T)).view(1, 1, T, T) + # Mask_Decoder = Mask_Decoder.masked_fill(mask_tril==0,0) + # Mask_Encoder = Mask_Encoder.to(X_Encoder.device) + # Mask_Decoder = Mask_Decoder.to(X_Decoder.device) + # Mask_Cross = Mask_Cross.to(X_Encoder.device) + # return Mask_Encoder,Mask_Decoder,Mask_Cross diff --git a/SCMG/models/Transformer_debug4/sampler.py b/SCMG/models/Transformer_debug4/sampler.py new file mode 100644 index 0000000000000000000000000000000000000000..1c606302447534d425b50bbd15c153ea79895b65 --- /dev/null +++ b/SCMG/models/Transformer_debug4/sampler.py @@ -0,0 +1,85 @@ +import random +import numpy as np +import torch +import torch.nn as nn +from torch.nn import functional as F + +def set_seed(seed): + random.seed(seed) + np.random.seed(seed) + torch.manual_seed(seed) + torch.cuda.manual_seed_all(seed) + +def top_k_logits(logits, k): + v, ix = torch.topk(logits, k) + out = logits.clone() + out[out < v[:, [-1]]] = -float('Inf') + return out + +@torch.no_grad() +def sample(model, x, steps, temperature=1.0, sample=False, top_k=None): + block_size = model.get_block_size() + model.eval() + for k in range(steps): + x_cond = x if x.size(1) <= block_size else x[:, -block_size:] + logits, _ = model(x_cond) + logits = logits[:, -1, :] / temperature + if top_k is not None: + logits = top_k_logits(logits, top_k) + probs = F.softmax(logits, dim=-1) + if sample: + ix = torch.multinomial(probs, num_samples=1) + else: + _, ix = torch.topk(probs, k=1, dim=-1) + x = torch.cat((x, ix), dim=1) + + return x + + + + +@torch.no_grad() +def sample(model, x, steps, temperature=1.0,boundary=None): + block_size = model.get_block_size() + model.eval() + for k in range(steps): + x_cond = x if x.size(1) <= block_size else x[:, -block_size:] + logits, _ = model(x_cond,boundary=boundary) + logits = logits[:, -1, :] / temperature + probs = F.softmax(logits, dim=-1) + ix = torch.multinomial(probs, 
num_samples=1) + x = torch.cat((x, ix), dim=1) + return x + +'L_5*C(=O)NCc1cccc(OC)c1.*c1nsc2ccccc12COc1cccc(CNC(=O)c2cccc(NC(=O)c3nsc4ccccc34)c2)c1' + +# for i in range(1,21): +def sample_L(i,option='string'): + # i=2 + prefix = 'L_'+str(i) + string_input = prefix + '*O=C1NN=Cc2c1cccc2.*O=C(C1CC1)N1CCNCC1' + array_input = [vocab[a] for a in [''] + list(string_input)] + boundary = [len(array_input)] + tensor_input = torch.tensor(array_input,device='cuda').unsqueeze(0).repeat(32,1) + boundary = boundary*32 + tensor_output = sample(model,tensor_input,250,boundary=boundary) + strings_output = [] + for j in range(tensor_output.shape[0]): + list_string_output = [inv[a] for a in tensor_output[j,boundary[j]:].cpu().numpy() if a != vocab['']] + # if list_string_output[0] == '': + # list_string_output = list_string_output[1:] + if list_string_output[-1] == '': + list_string_output = list_string_output[:-1] + string_output = ''.join(list_string_output) + strings_output.append(string_output) + print(string_output) + for j in range(tensor_output.shape[0]): + if test_valid(strings_output[j]): + print(1) + else: + print(0) + + # logits,_ = model(tensor_input,boundary=boundary) + + +['', 'L', '_', '5', '*', 'C', '(', '=', 'O', ')', 'N', 'C', 'c', '1', 'c', 'c', 'c', 'c', '(', 'O', 'C', ')', 'c', '1', '.', '*', 'c', '1', 'n', 's', 'c', '2', 'c', 'c', 'c', 'c', 'c', '1', '2', 'C', 'O', 'c', '1', 'c', 'c', 'c', 'c', '(', 'C', 'N', 'C', '(', '=', 'O', ')', 'c', '2', 'c', 'c', 'c', 'c', '(', 'N', 'C', '(', '=', 'O', ')', 'c', '3', 'n', 's', 'c', '4', 'c', 'c', 'c', 'c', 'c', '3', '4', ')', 'c', '2', ')', 'c', '1', ''] diff --git a/SCMG/models/Transformer_debug5 copy/__init__.py b/SCMG/models/Transformer_debug5 copy/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/SCMG/models/Transformer_debug5 copy/__pycache__/__init__.cpython-310.pyc b/SCMG/models/Transformer_debug5 
copy/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..67aab376b9f41c82542940cf66ac782c49b5b095 Binary files /dev/null and b/SCMG/models/Transformer_debug5 copy/__pycache__/__init__.cpython-310.pyc differ diff --git a/SCMG/models/Transformer_debug5 copy/__pycache__/model copy 2.cpython-310.pyc b/SCMG/models/Transformer_debug5 copy/__pycache__/model copy 2.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..129871a4e252bdddf64e15662997b5fac4f2bfd0 Binary files /dev/null and b/SCMG/models/Transformer_debug5 copy/__pycache__/model copy 2.cpython-310.pyc differ diff --git a/SCMG/models/Transformer_debug5 copy/__pycache__/model copy.cpython-310.pyc b/SCMG/models/Transformer_debug5 copy/__pycache__/model copy.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f8953b8d78bbd2dc0497858c6c99177a3d942936 Binary files /dev/null and b/SCMG/models/Transformer_debug5 copy/__pycache__/model copy.cpython-310.pyc differ diff --git a/SCMG/models/Transformer_debug5 copy/__pycache__/model.cpython-310.pyc b/SCMG/models/Transformer_debug5 copy/__pycache__/model.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..db32e645e4f11379b26cae6e0841a1539e6cc8aa Binary files /dev/null and b/SCMG/models/Transformer_debug5 copy/__pycache__/model.cpython-310.pyc differ diff --git a/SCMG/models/Transformer_debug5 copy/__pycache__/sampler.cpython-310.pyc b/SCMG/models/Transformer_debug5 copy/__pycache__/sampler.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1ecda0accfc6a66be21620b0fed9e3e41bcc497a Binary files /dev/null and b/SCMG/models/Transformer_debug5 copy/__pycache__/sampler.cpython-310.pyc differ diff --git a/SCMG/models/Transformer_debug5 copy/model copy 2.py b/SCMG/models/Transformer_debug5 copy/model copy 2.py new file mode 100644 index 
# ---------------------------------------------------------------------------
# SCMG/models/Transformer_debug5 copy/model copy 2.py — new file in this diff.
# NOTE(review): this is a backup scratch file; the classes below are exact
# duplicates of those in Transformer_debug4/model copy.py, and the module is
# truncated mid-import at the end of this chunk. Code kept byte-identical;
# comments only added.
# ---------------------------------------------------------------------------

import math
import logging

import torch
import torch.nn as nn
from torch.nn import functional as F

logger = logging.getLogger(__name__)
from SCMG.config import varables

# class ModelConfig():
#     rate_dropout_embedding = 0.1
#     rate_dropout_residue = 0.1
#     rate_dropout_attention = 0.1
#     block_size=125
#     def __init__(self, size_vocab, **kwargs):
#         self.size_vocab = size_vocab
#         for k,v in kwargs.items():
#             setattr(self, k, v)

class CausalSelfAttention(nn.Module):
    """Multi-head self-attention with a fixed lower-triangular (causal) mask."""
    def __init__(self, config):
        super().__init__()
        assert config[varables.DIM_ATTENTION] % config[varables.NUM_HEADS] == 0
        self.key = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION])
        self.query = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION])
        self.value = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION])
        self.dropout_attention = nn.Dropout(config[varables.RATE_DROPOUT])
        self.dropout_residue = nn.Dropout(config[varables.RATE_DROPOUT])
        self.projection = nn.Linear(config[varables.DIM_ATTENTION], config[varables.DIM_EMBEDDING])
        # Constant causal mask buffer (follows the module's device).
        self.register_buffer("mask", torch.tril(torch.ones(config[varables.SIZE_BLOCK], config[varables.SIZE_BLOCK]))
                             .view(1, 1, config[varables.SIZE_BLOCK], config[varables.SIZE_BLOCK]))
        self.n_head = config[varables.NUM_HEADS]
        self.single_head_dim = config[varables.DIM_ATTENTION] // self.n_head
        self.attention_features = config[varables.DIM_ATTENTION]

    def forward(self, x, layer_past=None):
        B, T, C = x.size()
        k = self.key(x).view(B, T, self.n_head,self.single_head_dim).transpose(1, 2)
        q = self.query(x).view(B, T, self.n_head,self.single_head_dim).transpose(1, 2)
        v = self.value(x).view(B, T, self.n_head,self.single_head_dim).transpose(1, 2)
        att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1)))
        # Hide future positions before the softmax.
        att = att.masked_fill(self.mask[:,:,:T,:T] == 0, float('-inf'))
        att = F.softmax(att, dim=-1)
        att = self.dropout_attention(att)
        y = att @ v
        y = y.transpose(1, 2).contiguous().view(B, T, self.attention_features)
        y = self.dropout_residue(self.projection(y))
        return y


class CrossAttention(nn.Module):
    """Decoder-query / encoder-key-value attention.

    NOTE(review): a causal tril mask is applied across encoder positions —
    non-standard for cross-attention; confirm intent (same quirk as in
    Transformer_debug4/model copy.py).
    """
    def __init__(self, config):
        super().__init__()
        assert config[varables.DIM_ATTENTION] % config[varables.NUM_HEADS] == 0
        self.key = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION])
        self.query = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION])
        self.value = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION])
        self.dropout_attention = nn.Dropout(config[varables.RATE_DROPOUT])
        self.dropout_residue = nn.Dropout(config[varables.RATE_DROPOUT])
        self.projection = nn.Linear(config[varables.DIM_ATTENTION], config[varables.DIM_EMBEDDING])
        self.n_head = config[varables.NUM_HEADS]
        self.single_head_dim = config[varables.DIM_ATTENTION] // self.n_head
        self.attention_features = config[varables.DIM_ATTENTION]
        self.register_buffer("mask", torch.tril(torch.ones(config[varables.SIZE_BLOCK], config[varables.SIZE_BLOCK]))
                             .view(1, 1, config[varables.SIZE_BLOCK], config[varables.SIZE_BLOCK]))

    def forward(self, x_encoder,x_decoder, layer_past=None):
        B_encoder, T_encoder, C_encoder = x_encoder.size()
        B_decoder, T_decoder, C_decoder = x_decoder.size()
        k = self.key( x_encoder).view(B_encoder, T_encoder, self.n_head,self.single_head_dim).transpose(1, 2)
        q = self.query(x_decoder).view(B_encoder, T_decoder, self.n_head,self.single_head_dim).transpose(1, 2)
        v = self.value(x_encoder).view(B_encoder, T_encoder, self.n_head,self.single_head_dim).transpose(1, 2)
        att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1)))
        att = att.masked_fill(self.mask[:,:,:T_decoder,:T_encoder] == 0, float('-inf'))
        att = F.softmax(att, dim=-1)
        att = self.dropout_attention(att)
        y = att @ v
        y = y.transpose(1, 2).contiguous().view(B_encoder, T_decoder, self.attention_features)
        y = self.dropout_residue(self.projection(y))
        return y


class EncoderBlock(nn.Module):
    """Pre-norm encoder block (uses CausalSelfAttention — see review note on
    the Transformer_debug4 duplicate)."""
    def __init__(self, config):
        super().__init__()
        self.ln1 = nn.LayerNorm(config[varables.DIM_EMBEDDING])
        self.ln2 = nn.LayerNorm(config[varables.DIM_EMBEDDING])
        self.attn = CausalSelfAttention(config)
        self.mlp = nn.Sequential(
            nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_FEEDFORWARD]),
            nn.GELU(),
            nn.Linear(config[varables.DIM_FEEDFORWARD], config[varables.DIM_EMBEDDING]),
            nn.Dropout(config[varables.RATE_DROPOUT]),
        )

    def forward(self, x):
        x = x + self.attn(self.ln1(x))
        x = x + self.mlp(self.ln2(x))
        return x

class DecoderBlock(nn.Module):
    """Pre-norm decoder block; ln1 output feeds both attention sub-layers
    (no dedicated third LayerNorm — see review note on the duplicate)."""
    def __init__(self, config):
        super().__init__()
        self.ln1 = nn.LayerNorm(config[varables.DIM_EMBEDDING])
        self.ln2 = nn.LayerNorm(config[varables.DIM_EMBEDDING])
        self.masked_attn = CausalSelfAttention(config)
        self.cross_attn = CrossAttention(config)
        self.mlp = nn.Sequential(
            nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_FEEDFORWARD]),
            nn.GELU(),
            nn.Linear(config[varables.DIM_FEEDFORWARD], config[varables.DIM_EMBEDDING]),
            nn.Dropout(config[varables.RATE_DROPOUT]),
        )

    def forward(self, x_encoder,x):
        x = x + self.masked_attn(self.ln1(x))
        x = x + self.cross_attn(x_encoder,self.ln1(x))
        x = x + self.mlp(self.ln2(x))
        return x


# Redundant mid-file imports kept as in the original file.
import torch
import torch.nn as nn
import torch.nn.functional as F
import math


class Norm(nn.Module):
    """LayerNorm with learnable gain (`alpha`) and shift (`bias`)."""
    def __init__(self, d_model, eps = 1e-6):
        super().__init__()

        self.size = d_model

        # create two learnable parameters to calibrate normalisation
        self.alpha = nn.Parameter(torch.ones(self.size))
        self.bias = nn.Parameter(torch.zeros(self.size))

        self.eps = eps

    def forward(self, x):
        norm = self.alpha * (x - x.mean(dim=-1, keepdim=True)) \
            / (x.std(dim=-1, keepdim=True) + self.eps) + self.bias
        return norm

def attention(q, k, v, d_k, mask=None, dropout=None):
    """Scaled dot-product attention; mask positions equal to 0 are hidden."""
    scores = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(d_k)

    if mask is not None:
        mask = mask.unsqueeze(1)
        scores = scores.masked_fill(mask == 0, -1e9)

    scores = F.softmax(scores, dim=-1)

    if dropout is not None:
        scores = dropout(scores)

    output = torch.matmul(scores, v)
    return output

class MultiHeadAttention(nn.Module):
    """Multi-head attention with fused Q/K/V projections."""
    def __init__(self, heads, d_model, dropout = 0.1):
        super().__init__()

        self.d_model = d_model
        self.d_k = d_model // heads
        self.h = heads

        self.q_linear = nn.Linear(d_model, d_model)
        self.v_linear = nn.Linear(d_model, d_model)
        self.k_linear = nn.Linear(d_model, d_model)

        self.dropout = nn.Dropout(dropout)
        self.out = nn.Linear(d_model, d_model)

    def forward(self, q, k, v, mask=None):
        bs = q.size(0)

        # perform linear operation and split into N heads
        k = self.k_linear(k).view(bs, -1, self.h, self.d_k)
        q = self.q_linear(q).view(bs, -1, self.h, self.d_k)
        v = self.v_linear(v).view(bs, -1, self.h, self.d_k)

        # transpose to get dimensions bs * N * sl * d_model
        k = k.transpose(1,2)
        q = q.transpose(1,2)
        v = v.transpose(1,2)

        # calculate attention using the function defined above
        scores = attention(q, k, v, self.d_k, mask, self.dropout)
        # concatenate heads and put through final linear layer
        concat = scores.transpose(1,2).contiguous()\
            .view(bs, -1, self.d_model)
        output = self.out(concat)

        return output

class FeedForward(nn.Module):
    """Position-wise two-layer MLP with ReLU; d_ff defaults to 2048."""
    def __init__(self, d_model, d_ff=2048, dropout = 0.1):
        super().__init__()

        # We set d_ff as a default to 2048
        self.linear_1 = nn.Linear(d_model, d_ff)
        self.dropout = nn.Dropout(dropout)
        self.linear_2 = nn.Linear(d_ff, d_model)

    def forward(self, x):
        x = self.dropout(F.relu(self.linear_1(x)))
        x = self.linear_2(x)
        return x


import torch
# NOTE(review): the chunk is truncated mid-import here; the remainder of this
# file continues beyond the visible range.
import torch.nn as nn
import copy


class EncoderLayer(nn.Module):
    """Pre-norm encoder layer: self-attention sub-layer + feed-forward sub-layer."""

    def __init__(self, d_model, heads, dropout=0.1):
        super().__init__()
        self.norm_1 = Norm(d_model)
        self.norm_2 = Norm(d_model)
        self.attn = MultiHeadAttention(heads, d_model, dropout=dropout)
        self.ff = FeedForward(d_model, dropout=dropout)
        self.dropout_1 = nn.Dropout(dropout)
        self.dropout_2 = nn.Dropout(dropout)

    def forward(self, x, mask):
        x2 = self.norm_1(x)
        x = x + self.dropout_1(self.attn(x2, x2, x2, mask))
        x2 = self.norm_2(x)
        x = x + self.dropout_2(self.ff(x2))
        return x


class DecoderLayer(nn.Module):
    """Pre-norm decoder layer: masked self-attention, encoder-decoder attention,
    then feed-forward."""

    def __init__(self, d_model, heads, dropout=0.1):
        super().__init__()
        self.norm_1 = Norm(d_model)
        self.norm_2 = Norm(d_model)
        self.norm_3 = Norm(d_model)
        self.dropout_1 = nn.Dropout(dropout)
        self.dropout_2 = nn.Dropout(dropout)
        self.dropout_3 = nn.Dropout(dropout)
        self.attn_1 = MultiHeadAttention(heads, d_model, dropout=dropout)
        self.attn_2 = MultiHeadAttention(heads, d_model, dropout=dropout)
        self.ff = FeedForward(d_model, dropout=dropout)

    def forward(self, x, e_outputs, src_mask, trg_mask):
        x2 = self.norm_1(x)
        x = x + self.dropout_1(self.attn_1(x2, x2, x2, trg_mask))
        x2 = self.norm_2(x)
        x = x + self.dropout_2(self.attn_2(x2, e_outputs, e_outputs, src_mask))
        x2 = self.norm_3(x)
        x = x + self.dropout_3(self.ff(x2))
        return x


import torch
import torch.nn as nn
import math
from torch.autograd import Variable


class Embedder(nn.Module):
    """Thin wrapper around nn.Embedding."""

    def __init__(self, vocab_size, d_model):
        super().__init__()
        self.d_model = d_model
        self.embed = nn.Embedding(vocab_size, d_model)

    def forward(self, x):
        return self.embed(x)


class PositionalEncoder(nn.Module):
    """Adds fixed sinusoidal positional encodings to scaled embeddings."""

    def __init__(self, d_model, max_seq_len=200, dropout=0.1):
        super().__init__()
        self.d_model = d_model
        self.dropout = nn.Dropout(dropout)
        # Constant 'pe' matrix with values dependent on position and channel.
        pe = torch.zeros(max_seq_len, d_model)
        for pos in range(max_seq_len):
            for i in range(0, d_model, 2):
                pe[pos, i] = math.sin(pos / (10000 ** ((2 * i) / d_model)))
                # BUG FIX: guard the odd channel -- the original indexed
                # pe[pos, i + 1] unconditionally, an IndexError for odd d_model.
                if i + 1 < d_model:
                    pe[pos, i + 1] = math.cos(pos / (10000 ** ((2 * (i + 1)) / d_model)))
        pe = pe.unsqueeze(0)
        self.register_buffer('pe', pe)

    def forward(self, x):
        # Make embeddings relatively larger before adding the constant.
        x = x * math.sqrt(self.d_model)
        seq_len = x.size(1)
        # BUG FIX: the original did `pe.cuda()` without assigning the result
        # (a no-op), so the addition could run on mismatched devices when the
        # buffer had not been moved; `.to(x.device)` is the correct,
        # Variable-free form and is a no-op when devices already match.
        x = x + self.pe[:, :seq_len].to(x.device)
        return self.dropout(x)


def get_clones(module, N):
    """Return a ModuleList of N deep copies of *module*."""
    return nn.ModuleList([copy.deepcopy(module) for i in range(N)])


class Encoder(nn.Module):
    """Embedding + positional encoding + N encoder layers + final Norm."""

    def __init__(self, vocab_size, d_model, N, heads, dropout):
        super().__init__()
        self.N = N
        self.embed = Embedder(vocab_size, d_model)
        self.pe = PositionalEncoder(d_model, dropout=dropout)
        self.layers = get_clones(EncoderLayer(d_model, heads, dropout), N)
        self.norm = Norm(d_model)

    def forward(self, src, mask):
        x = self.pe(self.embed(src))
        for i in range(self.N):
            x = self.layers[i](x, mask)
        return self.norm(x)


class Decoder(nn.Module):
    """Embedding + positional encoding + N decoder layers + final Norm."""

    def __init__(self, vocab_size, d_model, N, heads, dropout):
        super().__init__()
        self.N = N
        self.embed = Embedder(vocab_size, d_model)
        self.pe = PositionalEncoder(d_model, dropout=dropout)
        self.layers = get_clones(DecoderLayer(d_model, heads, dropout), N)
        self.norm = Norm(d_model)

    def forward(self, trg, e_outputs, src_mask, trg_mask):
        x = self.pe(self.embed(trg))
        for i in range(self.N):
            x = self.layers[i](x, e_outputs, src_mask, trg_mask)
        return self.norm(x)


class Model(nn.Module):
    """Seq2seq transformer: separate encoder/decoder vocabularies, linear head.

    Sizes are read from a config dict keyed by ``SCMG.config.varables``
    constants plus the literal keys ``"vocab_encoder"`` / ``"vocab_decoder"``.
    """

    def __init__(self, config):
        super().__init__()
        d_model = config[varables.DIM_ATTENTION]
        n_layers = config[varables.NUM_LAYERS]
        heads = config[varables.NUM_HEADS]
        dropout = config[varables.RATE_DROPOUT]
        self.encoder = Encoder(len(config["vocab_encoder"]), d_model, n_layers, heads, dropout)
        self.decoder = Decoder(len(config["vocab_decoder"]), d_model, n_layers, heads, dropout)
        self.out = nn.Linear(d_model, len(config["vocab_decoder"]))
        # Set externally before init_scheduler() is usable.
        self.optimizer = None

    def get_block_size(self):
        # NOTE(review): self.block_size is never assigned in this variant (the
        # assignment is commented out), so calling this raises AttributeError.
        # Kept as-is to preserve the interface; confirm whether it is still needed.
        return self.block_size

    def _init_weights(self, module):
        # GPT-style init: N(0, 0.02) for linear/embedding, unit LayerNorm.
        if isinstance(module, (nn.Linear, nn.Embedding)):
            module.weight.data.normal_(mean=0.0, std=0.02)
            if isinstance(module, nn.Linear) and module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

    def init_optimizers(self, train_config):
        optimizer = torch.optim.Adam(self.parameters(), lr=train_config[varables.RATE_LEARNING])
        return optimizer

    def init_scheduler(self, train_config):
        # NOTE(review): relies on self.optimizer having been assigned by the
        # caller; it is None after __init__.
        scheduler = torch.optim.lr_scheduler.StepLR(
            self.optimizer,
            step_size=train_config[varables.SIZE_STEP],
            gamma=train_config[varables.GAMMA],
        )
        return scheduler

    def get_collate_fn(self, vocab_encoder, vocab_decoder):
        """Return a collate fn that right-pads encoder/decoder sequences."""

        def collate(results):
            x_in = [a[0] for a in results]
            y_in = [a[1] for a in results]
            boundary = -1  # unused placeholder, kept for interface parity
            max_len_x = max(len(a) for a in x_in)
            max_len_y = max(len(a) for a in y_in)
            x = torch.tensor(
                [a + [vocab_encoder[varables.TOKEN_PAD]] * (max_len_x - len(a)) for a in x_in],
                dtype=torch.long,
            )
            y = torch.tensor(
                [a + [vocab_decoder[varables.TOKEN_PAD]] * (max_len_y - len(a)) for a in y_in],
                dtype=torch.long,
            )
            return x, y, boundary

        return collate

    def forward(self, src, trg, trg_out, boundary=None):
        """Return (logits, loss); loss is None when trg_out is None."""
        src_mask = None  # NOTE(review): no padding mask on the encoder side
        trg_mask = (
            torch.tril(torch.ones(trg.shape[1], trg.shape[1]))
            .view(1, 1, trg.shape[1], trg.shape[1])
            .to(trg.device)
        )
        e_outputs = self.encoder(src, src_mask)
        d_output = self.decoder(trg, e_outputs, src_mask, trg_mask)
        logits = self.out(d_output)
        loss = None
        if trg_out is not None:
            loss = F.cross_entropy(logits.view(-1, logits.size(-1)), trg_out.view(-1))
        return logits, loss

# mark test

# ---- SCMG/models/Transformer_debug5 copy/model copy.py ----
import math
import logging

import torch
import torch.nn as nn
from torch.nn import functional as F

logger = logging.getLogger(__name__)
from SCMG.config import varables
class CausalSelfAttention(nn.Module):
    """Masked multi-head self-attention (positions attend only backwards)."""

    def __init__(self, config):
        super().__init__()
        assert config[varables.DIM_ATTENTION] % config[varables.NUM_HEADS] == 0
        d_emb = config[varables.DIM_EMBEDDING]
        d_att = config[varables.DIM_ATTENTION]
        block = config[varables.SIZE_BLOCK]
        p_drop = config[varables.RATE_DROPOUT]
        self.key = nn.Linear(d_emb, d_att)
        self.query = nn.Linear(d_emb, d_att)
        self.value = nn.Linear(d_emb, d_att)
        self.dropout_attention = nn.Dropout(p_drop)
        self.dropout_residue = nn.Dropout(p_drop)
        self.projection = nn.Linear(d_att, d_emb)
        # (1, 1, block, block) lower-triangular causal mask.
        self.register_buffer(
            "mask",
            torch.tril(torch.ones(block, block)).view(1, 1, block, block),
        )
        self.n_head = config[varables.NUM_HEADS]
        self.single_head_dim = d_att // self.n_head
        self.attention_features = d_att

    def forward(self, x, layer_past=None):
        B, T, _ = x.size()
        k = self.key(x).view(B, T, self.n_head, self.single_head_dim).transpose(1, 2)
        q = self.query(x).view(B, T, self.n_head, self.single_head_dim).transpose(1, 2)
        v = self.value(x).view(B, T, self.n_head, self.single_head_dim).transpose(1, 2)
        att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1)))
        att = att.masked_fill(self.mask[:, :, :T, :T] == 0, float('-inf'))
        att = self.dropout_attention(F.softmax(att, dim=-1))
        y = (att @ v).transpose(1, 2).contiguous().view(B, T, self.attention_features)
        return self.dropout_residue(self.projection(y))


class CrossAttention(nn.Module):
    """Decoder-to-encoder attention.

    NOTE(review): applies the triangular causal mask across the encoder axis
    (decoder position t sees encoder positions <= t), which is unusual for
    cross-attention -- confirm intent.
    """

    def __init__(self, config):
        super().__init__()
        assert config[varables.DIM_ATTENTION] % config[varables.NUM_HEADS] == 0
        d_emb = config[varables.DIM_EMBEDDING]
        d_att = config[varables.DIM_ATTENTION]
        block = config[varables.SIZE_BLOCK]
        p_drop = config[varables.RATE_DROPOUT]
        self.key = nn.Linear(d_emb, d_att)
        self.query = nn.Linear(d_emb, d_att)
        self.value = nn.Linear(d_emb, d_att)
        self.dropout_attention = nn.Dropout(p_drop)
        self.dropout_residue = nn.Dropout(p_drop)
        self.projection = nn.Linear(d_att, d_emb)
        self.n_head = config[varables.NUM_HEADS]
        self.single_head_dim = d_att // self.n_head
        self.attention_features = d_att
        self.register_buffer(
            "mask",
            torch.tril(torch.ones(block, block)).view(1, 1, block, block),
        )

    def forward(self, x_encoder, x_decoder, layer_past=None):
        B, T_enc, _ = x_encoder.size()
        _, T_dec, _ = x_decoder.size()
        k = self.key(x_encoder).view(B, T_enc, self.n_head, self.single_head_dim).transpose(1, 2)
        q = self.query(x_decoder).view(B, T_dec, self.n_head, self.single_head_dim).transpose(1, 2)
        v = self.value(x_encoder).view(B, T_enc, self.n_head, self.single_head_dim).transpose(1, 2)
        att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1)))
        att = att.masked_fill(self.mask[:, :, :T_dec, :T_enc] == 0, float('-inf'))
        att = self.dropout_attention(F.softmax(att, dim=-1))
        y = (att @ v).transpose(1, 2).contiguous().view(B, T_dec, self.attention_features)
        return self.dropout_residue(self.projection(y))


class EncoderBlock(nn.Module):
    """Pre-norm encoder block: self-attention residual + MLP residual."""

    def __init__(self, config):
        super().__init__()
        d_emb = config[varables.DIM_EMBEDDING]
        self.ln1 = nn.LayerNorm(d_emb)
        self.ln2 = nn.LayerNorm(d_emb)
        self.attn = CausalSelfAttention(config)
        self.mlp = nn.Sequential(
            nn.Linear(d_emb, config[varables.DIM_FEEDFORWARD]),
            nn.GELU(),
            nn.Linear(config[varables.DIM_FEEDFORWARD], d_emb),
            nn.Dropout(config[varables.RATE_DROPOUT]),
        )

    def forward(self, x):
        x = x + self.attn(self.ln1(x))
        return x + self.mlp(self.ln2(x))


class DecoderBlock(nn.Module):
    """Pre-norm decoder block: masked self-attention, cross-attention, MLP.

    NOTE(review): ``ln1`` normalises the input of both attention sub-layers.
    """

    def __init__(self, config):
        super().__init__()
        d_emb = config[varables.DIM_EMBEDDING]
        self.ln1 = nn.LayerNorm(d_emb)
        self.ln2 = nn.LayerNorm(d_emb)
        self.masked_attn = CausalSelfAttention(config)
        self.cross_attn = CrossAttention(config)
        self.mlp = nn.Sequential(
            nn.Linear(d_emb, config[varables.DIM_FEEDFORWARD]),
            nn.GELU(),
            nn.Linear(config[varables.DIM_FEEDFORWARD], d_emb),
            nn.Dropout(config[varables.RATE_DROPOUT]),
        )

    def forward(self, x_encoder, x):
        x = x + self.masked_attn(self.ln1(x))
        x = x + self.cross_attn(x_encoder, self.ln1(x))
        return x + self.mlp(self.ln2(x))


class Model(nn.Module):
    """Encoder-decoder transformer sharing one token embedding and vocabulary.

    NOTE(review): the same final LayerNorm ``ln_f`` is applied to both the
    encoder output and the decoder output -- confirm this sharing is intended.
    """

    def __init__(self, config):
        super().__init__()
        size_vocab = config[varables.SIZE_VOCAB]
        d_emb = config[varables.DIM_EMBEDDING]
        n_layers = config[varables.NUM_LAYERS]
        self.tok_emb = nn.Embedding(size_vocab, d_emb)
        self.pos_emb = nn.Parameter(torch.zeros(1, config[varables.SIZE_BLOCK], d_emb))
        self.drop = nn.Dropout(config[varables.RATE_DROPOUT])
        self.encoder_blocks = nn.ModuleList(EncoderBlock(config) for _ in range(n_layers))
        self.decoder_blocks = nn.ModuleList(DecoderBlock(config) for _ in range(n_layers))
        self.ln_f = nn.LayerNorm(d_emb)
        self.head = nn.Linear(d_emb, size_vocab, bias=False)
        self.block_size = config[varables.SIZE_BLOCK]
        self.apply(self._init_weights)
        logger.info("number of parameters: %e", sum(p.numel() for p in self.parameters()))
        self.optimizer = None

    def get_block_size(self):
        return self.block_size

    def _init_weights(self, module):
        # GPT-style initialisation.
        if isinstance(module, (nn.Linear, nn.Embedding)):
            module.weight.data.normal_(mean=0.0, std=0.02)
            if isinstance(module, nn.Linear) and module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

    def init_optimizers(self, train_config):
        optimizer = torch.optim.Adam(self.parameters(), lr=train_config[varables.RATE_LEARNING])
        return optimizer

    def init_scheduler(self, train_config):
        # NOTE(review): requires self.optimizer to be assigned by the caller.
        scheduler = torch.optim.lr_scheduler.StepLR(
            self.optimizer,
            step_size=train_config[varables.SIZE_STEP],
            gamma=train_config[varables.GAMMA],
        )
        return scheduler

    def get_collate_fn(self, vocab):
        """Collate fn padding both source and target with the shared vocab's pad token."""

        def collate(results):
            x_in = [a[0] for a in results]
            y_in = [a[1] for a in results]
            boundary = -1  # unused placeholder, kept for interface parity
            max_len_x = max(len(a) for a in x_in)
            max_len_y = max(len(a) for a in y_in)
            x = torch.tensor(
                [a + [vocab[varables.TOKEN_PAD]] * (max_len_x - len(a)) for a in x_in],
                dtype=torch.long,
            )
            y = torch.tensor(
                [a + [vocab[varables.TOKEN_PAD]] * (max_len_y - len(a)) for a in y_in],
                dtype=torch.long,
            )
            return x, y, boundary

        return collate

    def forward(self, x_in, y_in, y_out=None, boundary=None):
        """Return (logits, loss); loss is None when y_out is None."""
        x_in = self.drop(self.tok_emb(x_in) + self.pos_emb[:, :x_in.size()[1], :])
        y_in = self.drop(self.tok_emb(y_in) + self.pos_emb[:, :y_in.size()[1], :])
        for encoder_block in self.encoder_blocks:
            x_in = encoder_block(x_in)
        x_in = self.ln_f(x_in)
        for decoder_block in self.decoder_blocks:
            y_in = decoder_block(x_in, y_in)
        y_in = self.ln_f(y_in)
        logits = self.head(y_in)
        loss = None
        if y_out is not None:
            loss = F.cross_entropy(logits.view(-1, logits.size(-1)), y_out.view(-1))
        return logits, loss

# mark test

# ---- SCMG/models/Transformer_debug5 copy/model.py ----
import math
import logging

import torch
import torch.nn as nn
from torch.nn import functional as F

# logger = logging.getLogger(__name__)
from SCMG.config import varables
from torch.autograd import Variable
class PositionalEncoder(nn.Module):
    """Fixed sinusoidal positional encodings, returned (with dropout) by length.

    Unlike the tutorial version above, ``forward`` takes a sequence length T
    and returns the (dropped-out) encoding slice; the caller adds it itself.
    """

    def __init__(self, config):
        super(PositionalEncoder, self).__init__()
        self.Dropout = nn.Dropout(p=config[varables.RATE_DROPOUT])
        max_len = config[varables.SIZE_BLOCK]
        d_model = config[varables.DIM_EMBEDDING]
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        self.register_buffer('pe', pe.unsqueeze(0))

    def forward(self, T):
        """Return dropout(pe[:, :T, :]) -- shape (1, T, dim_embedding)."""
        return self.Dropout(self.pe[:, :T, :])


class Attention(nn.Module):
    """Unified multi-head attention: self-attention when X_2 is None,
    cross-attention (keys/values from X_1, queries from X_2) otherwise."""

    def __init__(self, config):
        super().__init__()
        assert config[varables.DIM_ATTENTION] % config[varables.NUM_HEADS] == 0
        d_emb = config[varables.DIM_EMBEDDING]
        d_att = config[varables.DIM_ATTENTION]
        self.Key = nn.Linear(d_emb, d_att)
        self.Query = nn.Linear(d_emb, d_att)
        self.Value = nn.Linear(d_emb, d_att)
        self.Dropout_Attention = nn.Dropout(config[varables.RATE_DROPOUT])
        self.Dropout_Residue = nn.Dropout(config[varables.RATE_DROPOUT])
        self.Projection = nn.Linear(d_att, d_emb)
        self.NumberOfHeads = config[varables.NUM_HEADS]
        self.DimHead = d_att // self.NumberOfHeads
        self.DimAttention = d_att

    def forward(self, X_1, X_2, mask=None):
        if X_2 is None:
            X_2 = X_1  # self-attention
        BatchSize, T_Encoder, _ = X_1.size()
        BatchSize, T_Decoder, _ = X_2.size()
        K = self.Key(X_1).view(BatchSize, T_Encoder, self.NumberOfHeads, self.DimHead).transpose(1, 2)
        Q = self.Query(X_2).view(BatchSize, T_Decoder, self.NumberOfHeads, self.DimHead).transpose(1, 2)
        V = self.Value(X_1).view(BatchSize, T_Encoder, self.NumberOfHeads, self.DimHead).transpose(1, 2)
        ScoreAttention = (Q @ K.transpose(-2, -1)) / math.sqrt(self.DimHead)
        # mask == 0 positions are suppressed with a large negative value.
        ScoreAttention = ScoreAttention.masked_fill(mask == 0, -1e9)
        ScoreAttention = F.softmax(ScoreAttention, dim=-1)
        ScoreAttention = self.Dropout_Attention(ScoreAttention)
        Z = ScoreAttention @ V
        Z = Z.transpose(1, 2).contiguous().view(BatchSize, T_Decoder, self.DimAttention)
        return self.Dropout_Residue(self.Projection(Z))


class FeedForward(nn.Module):
    """Position-wise MLP; width defaults to 4x attention dim when
    DIM_FEEDFORWARD is 0."""

    def __init__(self, config):
        super().__init__()
        if config[varables.DIM_FEEDFORWARD] == 0:
            Dim_FeedForward = config[varables.DIM_ATTENTION] * 4
        else:
            Dim_FeedForward = config[varables.DIM_FEEDFORWARD]
        self.Linear1 = nn.Linear(config[varables.DIM_EMBEDDING], Dim_FeedForward)
        self.GELU = nn.GELU()
        self.Linear2 = nn.Linear(Dim_FeedForward, config[varables.DIM_EMBEDDING])
        self.Dropout = nn.Dropout(config[varables.RATE_DROPOUT])

    def forward(self, x):
        return self.Linear2(self.Dropout(self.GELU(self.Linear1(x))))


class EncoderBlock(nn.Module):
    """Post-norm encoder block.

    NOTE(review): dropout is applied to the *input* of each sub-layer rather
    than its output, which differs from the standard formulation -- confirm.
    """

    def __init__(self, config):
        super().__init__()
        d_emb = config[varables.DIM_EMBEDDING]
        self.LayerNorm1 = nn.LayerNorm(d_emb)
        self.LayerNorm2 = nn.LayerNorm(d_emb)
        self.Dropout1 = nn.Dropout(config[varables.RATE_DROPOUT])
        self.Dropout2 = nn.Dropout(config[varables.RATE_DROPOUT])
        self.Attention = Attention(config)
        self.FeedForward = FeedForward(config)

    def forward(self, X_Encoder, Mask_Encoder):
        X_Encoder = self.LayerNorm1(X_Encoder + self.Attention(self.Dropout1(X_Encoder), None, Mask_Encoder))
        X_Encoder = self.LayerNorm2(X_Encoder + self.FeedForward(self.Dropout2(X_Encoder)))
        return X_Encoder


class DecoderBlock(nn.Module):
    """Post-norm decoder block: masked self-attention, cross-attention, MLP."""

    def __init__(self, config):
        super().__init__()
        d_emb = config[varables.DIM_EMBEDDING]
        self.LayerNorm1 = nn.LayerNorm(d_emb)
        self.LayerNorm2 = nn.LayerNorm(d_emb)
        self.LayerNorm3 = nn.LayerNorm(d_emb)
        self.Dropout1 = nn.Dropout(config[varables.RATE_DROPOUT])
        self.Dropout2 = nn.Dropout(config[varables.RATE_DROPOUT])
        self.Dropout3 = nn.Dropout(config[varables.RATE_DROPOUT])
        self.AttentionMasked = Attention(config)
        self.AttentionCross = Attention(config)
        self.FeedForward = FeedForward(config)

    def forward(self, X_Encoder, X_Decoder, Mask_Cross, Mask_Decoder):
        X_Decoder = self.LayerNorm1(X_Decoder + self.AttentionMasked(self.Dropout1(X_Decoder), None, Mask_Decoder))
        X_Decoder = self.LayerNorm2(X_Decoder + self.AttentionCross(X_Encoder, self.Dropout2(X_Decoder), Mask_Cross))
        X_Decoder = self.LayerNorm3(X_Decoder + self.FeedForward(self.Dropout3(X_Decoder)))
        return X_Decoder


class Model(nn.Module):
    """Encoder-decoder transformer with separate vocabularies and explicit
    padding/causal masks built in :meth:`generate_masks`."""

    def __init__(self, config):
        super().__init__()
        # Configuration values used in forward().
        self.Dim_Embedding = config[varables.DIM_EMBEDDING]
        self.Token_Padding_Encoder = config["Token_Padding_Encoder"]
        self.Token_Padding_Decoder = config["Token_Padding_Decoder"]
        # Embedding and positional encoding layers.
        self.Embedding_Encoder = nn.Embedding(len(config["vocab_encoder"]), self.Dim_Embedding)
        self.Embedding_Decoder = nn.Embedding(len(config["vocab_decoder"]), self.Dim_Embedding)
        self.pos_emb = PositionalEncoder(config)
        # Dropout and normalisation layers.  NOTE(review): LayerNorm1/2 are
        # currently unused (their calls in forward() are commented out); kept
        # so parameter/state_dict layout is unchanged.
        self.Dropout1 = nn.Dropout(config[varables.RATE_DROPOUT])
        self.Dropout2 = nn.Dropout(config[varables.RATE_DROPOUT])
        self.LayerNorm1 = nn.LayerNorm(self.Dim_Embedding)
        self.LayerNorm2 = nn.LayerNorm(self.Dim_Embedding)
        # Transformer stacks.
        self.encoder_blocks = nn.ModuleList([EncoderBlock(config) for _ in range(config[varables.NUM_LAYERS])])
        self.decoder_blocks = nn.ModuleList([DecoderBlock(config) for _ in range(config[varables.NUM_LAYERS])])
        # Output projection to decoder vocabulary.
        self.head = nn.Linear(self.Dim_Embedding, len(config["vocab_decoder"]), bias=False)
        self.apply(self._init_weights)
        self.optimizer = None

    def _init_weights(self, module):
        # Xavier-uniform for every parameter with rank > 1.
        for p in module.parameters():
            if p.dim() > 1:
                nn.init.xavier_uniform_(p)

    def init_optimizers(self, train_config):
        optimizer = torch.optim.Adam(self.parameters(), lr=train_config[varables.RATE_LEARNING])
        return optimizer

    def init_scheduler(self, train_config):
        # NOTE(review): requires self.optimizer to be assigned by the caller.
        scheduler = torch.optim.lr_scheduler.StepLR(
            self.optimizer,
            step_size=train_config[varables.SIZE_STEP],
            gamma=train_config[varables.GAMMA],
        )
        return scheduler

    def get_collate_fn(self, vocab_encoder, vocab_decoder):
        """Collate fn right-padding encoder and decoder sequences separately."""

        def collate(results):
            X_Encoder = [a[0] for a in results]
            X_Decoder = [a[1] for a in results]
            boundary = -1  # unused placeholder, kept for interface parity
            max_len_x = max(len(a) for a in X_Encoder)
            max_len_y = max(len(a) for a in X_Decoder)
            x = torch.tensor(
                [a + [vocab_encoder[varables.TOKEN_PAD]] * (max_len_x - len(a)) for a in X_Encoder],
                dtype=torch.long,
            )
            y = torch.tensor(
                [a + [vocab_decoder[varables.TOKEN_PAD]] * (max_len_y - len(a)) for a in X_Decoder],
                dtype=torch.long,
            )
            return x, y, boundary

        return collate

    def generate_masks(self, X_Encoder, X_Decoder):
        """Build (encoder padding, decoder causal+padding, cross padding) masks."""
        T = X_Decoder.shape[1]
        Mask_Encoder = (X_Encoder != self.Token_Padding_Encoder).unsqueeze(-2).unsqueeze(-2)
        Mask_Decoder = (X_Decoder != self.Token_Padding_Decoder).unsqueeze(-2).unsqueeze(-2).repeat(1, 1, T, 1)
        Mask_Cross = (X_Encoder != self.Token_Padding_Encoder).unsqueeze(-2).unsqueeze(-2)
        # Combine the padding mask with the lower-triangular causal mask.
        mask_tril = torch.tril(torch.ones(T, T)).view(1, 1, T, T).to(Mask_Decoder.device)
        Mask_Decoder = Mask_Decoder.masked_fill(mask_tril == 0, 0)
        return Mask_Encoder, Mask_Decoder, Mask_Cross

    def forward(self, X_Encoder, X_Decoder, Y_Decoder_Ref=None, boundary=None):
        """Return (logits, loss); loss is None when Y_Decoder_Ref is None."""
        Mask_Encoder, Mask_Decoder, Mask_Cross = self.generate_masks(X_Encoder, X_Decoder)
        # Scaled embeddings + positional encodings.
        X_Encoder = self.Dropout1(self.Embedding_Encoder(X_Encoder) * math.sqrt(self.Dim_Embedding) + self.pos_emb(X_Encoder.size(1)))
        X_Decoder = self.Dropout2(self.Embedding_Decoder(X_Decoder) * math.sqrt(self.Dim_Embedding) + self.pos_emb(X_Decoder.size(1)))
        for encoder_block in self.encoder_blocks:
            X_Encoder = encoder_block(X_Encoder, Mask_Encoder)
        for decoder_block in self.decoder_blocks:
            X_Decoder = decoder_block(X_Encoder, X_Decoder, Mask_Cross, Mask_Decoder)
        Y_Decoder_Logits = self.head(X_Decoder)
        loss = None
        if Y_Decoder_Ref is not None:
            # BUG FIX: the original called F.log_softmax without `dim`, which
            # is deprecated and relies on an implicit-dim heuristic; dim=-1
            # (the vocabulary axis) is the intended one and makes this
            # equivalent to F.cross_entropy with ignore_index.
            loss = F.nll_loss(
                F.log_softmax(Y_Decoder_Logits, dim=-1).view(-1, Y_Decoder_Logits.size(-1)),
                Y_Decoder_Ref.view(-1),
                ignore_index=self.Token_Padding_Decoder,
            )
        return Y_Decoder_Logits, loss


# ---- SCMG/models/Transformer_debug5 copy/sampler.py ----
import random
import numpy as np
import torch
import torch.nn as nn
from torch.nn import functional as F


def set_seed(seed):
    """Seed python, numpy and torch (all CUDA devices) for reproducibility."""
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)


def top_k_logits(logits, k):
    """Set every logit below the k-th largest (per row) to -inf."""
    v, ix = torch.topk(logits, k)
    out = logits.clone()
    out[out < v[:, [-1]]] = -float('Inf')
    return out


@torch.no_grad()
def sample(model, x, steps, temperature=1.0, sample=False, top_k=None):
    """Autoregressive sampling with optional top-k filtering.

    NOTE(review): this definition is immediately shadowed by the boundary-aware
    `sample` below -- it is dead code kept for reference.
    """
    block_size = model.get_block_size()
    model.eval()
    for k in range(steps):
        x_cond = x if x.size(1) <= block_size else x[:, -block_size:]
        logits, _ = model(x_cond)
        logits = logits[:, -1, :] / temperature
        if top_k is not None:
            logits = top_k_logits(logits, top_k)
        probs = F.softmax(logits, dim=-1)
        if sample:
            ix = torch.multinomial(probs, num_samples=1)
        else:
            _, ix = torch.topk(probs, k=1, dim=-1)
        x = torch.cat((x, ix), dim=1)
    return x


@torch.no_grad()
def sample(model, x, steps, temperature=1.0, boundary=None):
    """Boundary-aware autoregressive multinomial sampling (shadows the above)."""
    block_size = model.get_block_size()
    model.eval()
    for k in range(steps):
        x_cond = x if x.size(1) <= block_size else x[:, -block_size:]
        logits, _ = model(x_cond, boundary=boundary)
        logits = logits[:, -1, :] / temperature
        probs = F.softmax(logits, dim=-1)
        ix = torch.multinomial(probs, num_samples=1)
        x = torch.cat((x, ix), dim=1)
    return x
'L_5*C(=O)NCc1cccc(OC)c1.*c1nsc2ccccc12COc1cccc(CNC(=O)c2cccc(NC(=O)c3nsc4ccccc34)c2)c1'


def sample_L(i, option='string'):
    """Debug helper: sample 32 completions for scaffold prompt ``L_<i>`` and
    print each decoded string plus a 1/0 validity flag.

    NOTE(review): relies on module globals that are not defined in this file
    (``vocab``, ``inv``, ``model``, ``test_valid``) and hard-codes the 'cuda'
    device -- this is scratch code and will raise NameError if imported and
    called as-is.
    """
    prefix = 'L_' + str(i)
    string_input = prefix + '*O=C1NN=Cc2c1cccc2.*O=C(C1CC1)N1CCNCC1'
    array_input = [vocab[a] for a in [''] + list(string_input)]
    boundary = [len(array_input)]
    tensor_input = torch.tensor(array_input, device='cuda').unsqueeze(0).repeat(32, 1)
    boundary = boundary * 32
    tensor_output = sample(model, tensor_input, 250, boundary=boundary)

    strings_output = []
    for j in range(tensor_output.shape[0]):
        # Decode everything after the prompt boundary, dropping pad-like tokens.
        decoded = [inv[a] for a in tensor_output[j, boundary[j]:].cpu().numpy() if a != vocab['']]
        if decoded[-1] == '':
            decoded = decoded[:-1]
        text = ''.join(decoded)
        strings_output.append(text)
        print(text)

    for j in range(tensor_output.shape[0]):
        # Print a 1/0 validity flag per sampled string.
        if test_valid(strings_output[j]):
            print(1)
        else:
            print(0)


['', 'L', '_', '5', '*', 'C', '(', '=', 'O', ')', 'N', 'C', 'c', '1', 'c', 'c', 'c', 'c', '(', 'O', 'C', ')', 'c', '1', '.', '*', 'c', '1', 'n', 's', 'c', '2', 'c', 'c', 'c', 'c', 'c', '1', '2', 'C', 'O', 'c', '1', 'c', 'c', 'c', 'c', '(', 'C', 'N', 'C', '(', '=', 'O', ')', 'c', '2', 'c', 'c', 'c', 'c', '(', 'N', 'C', '(', '=', 'O', ')', 'c', '3', 'n', 's', 'c', '4', 'c', 'c', 'c', 'c', 'c', '3', '4', ')', 'c', '2', ')', 'c', '1', '']
0000000000000000000000000000000000000000..4f4ce893aa53a9dc96895f7acf5c827cc60d3e1c Binary files /dev/null and b/SCMG/models/Transformer_debug5/__pycache__/__init__.cpython-310.pyc differ diff --git a/SCMG/models/Transformer_debug5/__pycache__/model copy 2.cpython-310.pyc b/SCMG/models/Transformer_debug5/__pycache__/model copy 2.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8f5ed60720263fd980189ce4e223ee6c8dfdd401 Binary files /dev/null and b/SCMG/models/Transformer_debug5/__pycache__/model copy 2.cpython-310.pyc differ diff --git a/SCMG/models/Transformer_debug5/__pycache__/model copy.cpython-310.pyc b/SCMG/models/Transformer_debug5/__pycache__/model copy.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..bfa1b683dc47e5ff8a5279b1c3e5ea64eeec8aa1 Binary files /dev/null and b/SCMG/models/Transformer_debug5/__pycache__/model copy.cpython-310.pyc differ diff --git a/SCMG/models/Transformer_debug5/__pycache__/model.cpython-310.pyc b/SCMG/models/Transformer_debug5/__pycache__/model.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..af9ac0f069787a05b4921c07a3c540c1712368e2 Binary files /dev/null and b/SCMG/models/Transformer_debug5/__pycache__/model.cpython-310.pyc differ diff --git a/SCMG/models/Transformer_debug5/__pycache__/sampler.cpython-310.pyc b/SCMG/models/Transformer_debug5/__pycache__/sampler.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8f9b18a160658196fcfc18fdf806256ed172319a Binary files /dev/null and b/SCMG/models/Transformer_debug5/__pycache__/sampler.cpython-310.pyc differ diff --git a/SCMG/models/Transformer_debug5/model copy 2.py b/SCMG/models/Transformer_debug5/model copy 2.py new file mode 100644 index 0000000000000000000000000000000000000000..8b254fbe02eafd3f7370cfae17eb6729563aa260 --- /dev/null +++ b/SCMG/models/Transformer_debug5/model copy 2.py @@ -0,0 +1,420 @@ +import math +import logging + +import 
torch +import torch.nn as nn +from torch.nn import functional as F + +logger = logging.getLogger(__name__) +from SCMG.config import varables + +# class ModelConfig(): +# rate_dropout_embedding = 0.1 +# rate_dropout_residue = 0.1 +# rate_dropout_attention = 0.1 +# block_size=125 +# def __init__(self, size_vocab, **kwargs): +# self.size_vocab = size_vocab +# for k,v in kwargs.items(): +# setattr(self, k, v) + +class CausalSelfAttention(nn.Module): + def __init__(self, config): + super().__init__() + assert config[varables.DIM_ATTENTION] % config[varables.NUM_HEADS] == 0 + self.key = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION]) + self.query = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION]) + self.value = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION]) + self.dropout_attention = nn.Dropout(config[varables.RATE_DROPOUT]) + self.dropout_residue = nn.Dropout(config[varables.RATE_DROPOUT]) + self.projection = nn.Linear(config[varables.DIM_ATTENTION], config[varables.DIM_EMBEDDING]) + self.register_buffer("mask", torch.tril(torch.ones(config[varables.SIZE_BLOCK], config[varables.SIZE_BLOCK])) + .view(1, 1, config[varables.SIZE_BLOCK], config[varables.SIZE_BLOCK])) + self.n_head = config[varables.NUM_HEADS] + self.single_head_dim = config[varables.DIM_ATTENTION] // self.n_head + self.attention_features = config[varables.DIM_ATTENTION] + + def forward(self, x, layer_past=None): + B, T, C = x.size() + k = self.key(x).view(B, T, self.n_head,self.single_head_dim).transpose(1, 2) + q = self.query(x).view(B, T, self.n_head,self.single_head_dim).transpose(1, 2) + v = self.value(x).view(B, T, self.n_head,self.single_head_dim).transpose(1, 2) + att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1))) + att = att.masked_fill(self.mask[:,:,:T,:T] == 0, float('-inf')) + att = F.softmax(att, dim=-1) + att = self.dropout_attention(att) + y = att @ v + y = y.transpose(1, 2).contiguous().view(B, T, 
self.attention_features) + y = self.dropout_residue(self.projection(y)) + return y + + +class CrossAttention(nn.Module): + def __init__(self, config): + super().__init__() + assert config[varables.DIM_ATTENTION] % config[varables.NUM_HEADS] == 0 + self.key = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION]) + self.query = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION]) + self.value = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION]) + self.dropout_attention = nn.Dropout(config[varables.RATE_DROPOUT]) + self.dropout_residue = nn.Dropout(config[varables.RATE_DROPOUT]) + self.projection = nn.Linear(config[varables.DIM_ATTENTION], config[varables.DIM_EMBEDDING]) + self.n_head = config[varables.NUM_HEADS] + self.single_head_dim = config[varables.DIM_ATTENTION] // self.n_head + self.attention_features = config[varables.DIM_ATTENTION] + self.register_buffer("mask", torch.tril(torch.ones(config[varables.SIZE_BLOCK], config[varables.SIZE_BLOCK])) + .view(1, 1, config[varables.SIZE_BLOCK], config[varables.SIZE_BLOCK])) + + def forward(self, x_encoder,x_decoder, layer_past=None): + B_encoder, T_encoder, C_encoder = x_encoder.size() + B_decoder, T_decoder, C_decoder = x_decoder.size() + k = self.key( x_encoder).view(B_encoder, T_encoder, self.n_head,self.single_head_dim).transpose(1, 2) + q = self.query(x_decoder).view(B_encoder, T_decoder, self.n_head,self.single_head_dim).transpose(1, 2) + v = self.value(x_encoder).view(B_encoder, T_encoder, self.n_head,self.single_head_dim).transpose(1, 2) + att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1))) + att = att.masked_fill(self.mask[:,:,:T_decoder,:T_encoder] == 0, float('-inf')) + att = F.softmax(att, dim=-1) + att = self.dropout_attention(att) + y = att @ v + y = y.transpose(1, 2).contiguous().view(B_encoder, T_decoder, self.attention_features) + y = self.dropout_residue(self.projection(y)) + return y + + + + +class EncoderBlock(nn.Module): + 
def __init__(self, config): + super().__init__() + self.ln1 = nn.LayerNorm(config[varables.DIM_EMBEDDING]) + self.ln2 = nn.LayerNorm(config[varables.DIM_EMBEDDING]) + self.attn = CausalSelfAttention(config) + self.mlp = nn.Sequential( + nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_FEEDFORWARD]), + nn.GELU(), + nn.Linear(config[varables.DIM_FEEDFORWARD], config[varables.DIM_EMBEDDING]), + nn.Dropout(config[varables.RATE_DROPOUT]), + ) + + def forward(self, x): + # = y_input + x = x + self.attn(self.ln1(x)) + x = x + self.mlp(self.ln2(x)) + return x + +class DecoderBlock(nn.Module): + def __init__(self, config): + super().__init__() + self.ln1 = nn.LayerNorm(config[varables.DIM_EMBEDDING]) + self.ln2 = nn.LayerNorm(config[varables.DIM_EMBEDDING]) + self.masked_attn = CausalSelfAttention(config) + self.cross_attn = CrossAttention(config) + self.mlp = nn.Sequential( + nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_FEEDFORWARD]), + nn.GELU(), + nn.Linear(config[varables.DIM_FEEDFORWARD], config[varables.DIM_EMBEDDING]), + nn.Dropout(config[varables.RATE_DROPOUT]), + ) + + def forward(self, x_encoder,x): + # = y_input + x = x + self.masked_attn(self.ln1(x)) + x = x + self.cross_attn(x_encoder,self.ln1(x)) + x = x + self.mlp(self.ln2(x)) + return x + + + + + + + + + + + + + + + + + +import torch +import torch.nn as nn +import torch.nn.functional as F +import math + + +class Norm(nn.Module): + def __init__(self, d_model, eps = 1e-6): + super().__init__() + + self.size = d_model + + # create two learnable parameters to calibrate normalisation + self.alpha = nn.Parameter(torch.ones(self.size)) + self.bias = nn.Parameter(torch.zeros(self.size)) + + self.eps = eps + + def forward(self, x): + norm = self.alpha * (x - x.mean(dim=-1, keepdim=True)) \ + / (x.std(dim=-1, keepdim=True) + self.eps) + self.bias + return norm + +def attention(q, k, v, d_k, mask=None, dropout=None): + + scores = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(d_k) + + if 
mask is not None: + mask = mask.unsqueeze(1) + scores = scores.masked_fill(mask == 0, -1e9) + + scores = F.softmax(scores, dim=-1) + + if dropout is not None: + scores = dropout(scores) + + output = torch.matmul(scores, v) + return output + +class MultiHeadAttention(nn.Module): + def __init__(self, heads, d_model, dropout = 0.1): + super().__init__() + + self.d_model = d_model + self.d_k = d_model // heads + self.h = heads + + self.q_linear = nn.Linear(d_model, d_model) + self.v_linear = nn.Linear(d_model, d_model) + self.k_linear = nn.Linear(d_model, d_model) + + self.dropout = nn.Dropout(dropout) + self.out = nn.Linear(d_model, d_model) + + def forward(self, q, k, v, mask=None): + + bs = q.size(0) + + # perform linear operation and split into N heads + k = self.k_linear(k).view(bs, -1, self.h, self.d_k) + q = self.q_linear(q).view(bs, -1, self.h, self.d_k) + v = self.v_linear(v).view(bs, -1, self.h, self.d_k) + + # transpose to get dimensions bs * N * sl * d_model + k = k.transpose(1,2) + q = q.transpose(1,2) + v = v.transpose(1,2) + + + # calculate attention using function we will define next + scores = attention(q, k, v, self.d_k, mask, self.dropout) + # concatenate heads and put through final linear layer + concat = scores.transpose(1,2).contiguous()\ + .view(bs, -1, self.d_model) + output = self.out(concat) + + return output + +class FeedForward(nn.Module): + def __init__(self, d_model, d_ff=2048, dropout = 0.1): + super().__init__() + + # We set d_ff as a default to 2048 + self.linear_1 = nn.Linear(d_model, d_ff) + self.dropout = nn.Dropout(dropout) + self.linear_2 = nn.Linear(d_ff, d_model) + + def forward(self, x): + x = self.dropout(F.relu(self.linear_1(x))) + x = self.linear_2(x) + return x + + + + +import torch +import torch.nn as nn +import copy + + +class EncoderLayer(nn.Module): + def __init__(self, d_model, heads, dropout=0.1): + super().__init__() + self.norm_1 = Norm(d_model) + self.norm_2 = Norm(d_model) + self.attn = MultiHeadAttention(heads, 
d_model, dropout=dropout) + self.ff = FeedForward(d_model, dropout=dropout) + self.dropout_1 = nn.Dropout(dropout) + self.dropout_2 = nn.Dropout(dropout) + + def forward(self, x, mask): + x2 = self.norm_1(x) + x = x + self.dropout_1(self.attn(x2,x2,x2,mask)) + x2 = self.norm_2(x) + x = x + self.dropout_2(self.ff(x2)) + return x + +# build a decoder layer with two multi-head attention layers and +# one feed-forward layer +class DecoderLayer(nn.Module): + def __init__(self, d_model, heads, dropout=0.1): + super().__init__() + self.norm_1 = Norm(d_model) + self.norm_2 = Norm(d_model) + self.norm_3 = Norm(d_model) + + self.dropout_1 = nn.Dropout(dropout) + self.dropout_2 = nn.Dropout(dropout) + self.dropout_3 = nn.Dropout(dropout) + + self.attn_1 = MultiHeadAttention(heads, d_model, dropout=dropout) + self.attn_2 = MultiHeadAttention(heads, d_model, dropout=dropout) + self.ff = FeedForward(d_model, dropout=dropout) + + def forward(self, x, e_outputs, src_mask, trg_mask): + x2 = self.norm_1(x) + x = x + self.dropout_1(self.attn_1(x2, x2, x2, trg_mask)) + x2 = self.norm_2(x) + x = x + self.dropout_2(self.attn_2(x2, e_outputs, e_outputs, \ + src_mask)) + x2 = self.norm_3(x) + x = x + self.dropout_3(self.ff(x2)) + return x + + +import torch +import torch.nn as nn +import math +from torch.autograd import Variable + +class Embedder(nn.Module): + def __init__(self, vocab_size, d_model): + super().__init__() + self.d_model = d_model + self.embed = nn.Embedding(vocab_size, d_model) + def forward(self, x): + return self.embed(x) + +class PositionalEncoder(nn.Module): + def __init__(self, d_model, max_seq_len = 200, dropout = 0.1): + super().__init__() + self.d_model = d_model + self.dropout = nn.Dropout(dropout) + # create constant 'pe' matrix with values dependant on + # pos and i + pe = torch.zeros(max_seq_len, d_model) + for pos in range(max_seq_len): + for i in range(0, d_model, 2): + pe[pos, i] = \ + math.sin(pos / (10000 ** ((2 * i)/d_model))) + pe[pos, i + 1] = \ + 
math.cos(pos / (10000 ** ((2 * (i + 1))/d_model))) + pe = pe.unsqueeze(0) + self.register_buffer('pe', pe) + + + def forward(self, x): + # make embeddings relatively larger + x = x * math.sqrt(self.d_model) + #add constant to embedding + seq_len = x.size(1) + pe = Variable(self.pe[:,:seq_len], requires_grad=False) + if x.is_cuda: + pe.cuda() + x = x + pe + return self.dropout(x) + +def get_clones(module, N): + return nn.ModuleList([copy.deepcopy(module) for i in range(N)]) + +class Encoder(nn.Module): + def __init__(self, vocab_size, d_model, N, heads, dropout): + super().__init__() + self.N = N + self.embed = Embedder(vocab_size, d_model) + self.pe = PositionalEncoder(d_model, dropout=dropout) + self.layers = get_clones(EncoderLayer(d_model, heads, dropout), N) + self.norm = Norm(d_model) + def forward(self, src, mask): + x = self.embed(src) + x = self.pe(x) + for i in range(self.N): + x = self.layers[i](x, mask) + return self.norm(x) + +class Decoder(nn.Module): + def __init__(self, vocab_size, d_model, N, heads, dropout): + super().__init__() + self.N = N + self.embed = Embedder(vocab_size, d_model) + self.pe = PositionalEncoder(d_model, dropout=dropout) + self.layers = get_clones(DecoderLayer(d_model, heads, dropout), N) + self.norm = Norm(d_model) + def forward(self, trg, e_outputs, src_mask, trg_mask): + x = self.embed(trg) + x = self.pe(x) + for i in range(self.N): + x = self.layers[i](x, e_outputs, src_mask, trg_mask) + return self.norm(x) + +class Model(nn.Module): + def __init__(self, config): + super().__init__() + self.encoder = Encoder(len(config["vocab_encoder"]), config[varables.DIM_ATTENTION], config[varables.NUM_LAYERS], config[varables.NUM_HEADS], config[varables.RATE_DROPOUT]) + self.decoder = Decoder(len(config["vocab_decoder"]), config[varables.DIM_ATTENTION], config[varables.NUM_LAYERS], config[varables.NUM_HEADS], config[varables.RATE_DROPOUT]) + self.out = nn.Linear(config[varables.DIM_ATTENTION], len(config["vocab_decoder"])) + # 
self.tok_emb = nn.Embedding(config[varables.SIZE_VOCAB], config[varables.DIM_EMBEDDING]) + # self.pos_emb = nn.Parameter(torch.zeros(1, config[varables.SIZE_BLOCK], config[varables.DIM_EMBEDDING])) + # self.drop = nn.Dropout(config[varables.RATE_DROPOUT]) + # self.encoder_blocks = nn.ModuleList([EncoderBlock(config) for _ in range(config[varables.NUM_LAYERS])]) + # self.decoder_blocks = nn.ModuleList([DecoderBlock(config) for _ in range(config[varables.NUM_LAYERS])]) + # self.blocks = nn.Sequential(*[DecoderBlock(config) for _ in range(config[varables.NUM_LAYERS])]) + # self.ln_f = nn.LayerNorm(config[varables.DIM_EMBEDDING]) + # self.head = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.SIZE_VOCAB], bias=False) + # self.block_size = config[varables.SIZE_BLOCK] + # self.apply(self._init_weights) + # logger.info("number of parameters: %e", sum(p.numel() for p in self.parameters())) + self.optimizer = None + + def get_block_size(self): + return self.block_size + + def _init_weights(self, module): + if isinstance(module, (nn.Linear, nn.Embedding)): + module.weight.data.normal_(mean=0.0, std=0.02) + if isinstance(module, nn.Linear) and module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.LayerNorm): + module.bias.data.zero_() + module.weight.data.fill_(1.0) + def init_optimizers(self,train_config): + optimizer = torch.optim.Adam(self.parameters(), lr=train_config[varables.RATE_LEARNING]) + return optimizer + def init_scheduler(self,train_config): + scheduler = torch.optim.lr_scheduler.StepLR(self.optimizer, step_size=train_config[varables.SIZE_STEP], gamma=train_config[varables.GAMMA]) + return scheduler + def get_collate_fn(self, vocab_encoder,vocab_decoder): + def collate(results): + x_in = [a[0] for a in results] + y_in = [a[1] for a in results] + boundary = -1 + max_len_x = max([len(a) for a in x_in]) + max_len_y = max([len(a) for a in y_in]) + x = torch.tensor([(a+[vocab_encoder[varables.TOKEN_PAD]]*(max_len_x-len(a))) for a 
in x_in],dtype=torch.long) + y = torch.tensor([(a+[vocab_decoder[varables.TOKEN_PAD]]*(max_len_y-len(a))) for a in y_in],dtype=torch.long) + return x,y,boundary + return collate + def forward(self, src, trg, trg_out, boundary=None): + src_mask = None + trg_mask = torch.tril(torch.ones(trg.shape[1], trg.shape[1])).view(1, 1, trg.shape[1], trg.shape[1]).to(trg.device) + e_outputs = self.encoder(src, src_mask) + d_output = self.decoder(trg, e_outputs, src_mask, trg_mask) + logits = self.out(d_output) + loss = None + if trg_out is not None: + loss = F.cross_entropy(logits.view(-1, logits.size(-1)), trg_out.view(-1)) + return logits, loss + +# mark test \ No newline at end of file diff --git a/SCMG/models/Transformer_debug5/model copy.py b/SCMG/models/Transformer_debug5/model copy.py new file mode 100644 index 0000000000000000000000000000000000000000..85ed98da342e63696371099158471e07cd1bf25c --- /dev/null +++ b/SCMG/models/Transformer_debug5/model copy.py @@ -0,0 +1,187 @@ +import math +import logging + +import torch +import torch.nn as nn +from torch.nn import functional as F + +logger = logging.getLogger(__name__) +from SCMG.config import varables + +# class ModelConfig(): +# rate_dropout_embedding = 0.1 +# rate_dropout_residue = 0.1 +# rate_dropout_attention = 0.1 +# block_size=125 +# def __init__(self, size_vocab, **kwargs): +# self.size_vocab = size_vocab +# for k,v in kwargs.items(): +# setattr(self, k, v) + +class CausalSelfAttention(nn.Module): + def __init__(self, config): + super().__init__() + assert config[varables.DIM_ATTENTION] % config[varables.NUM_HEADS] == 0 + self.key = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION]) + self.query = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION]) + self.value = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION]) + self.dropout_attention = nn.Dropout(config[varables.RATE_DROPOUT]) + self.dropout_residue = nn.Dropout(config[varables.RATE_DROPOUT]) + 
self.projection = nn.Linear(config[varables.DIM_ATTENTION], config[varables.DIM_EMBEDDING]) + self.register_buffer("mask", torch.tril(torch.ones(config[varables.SIZE_BLOCK], config[varables.SIZE_BLOCK])) + .view(1, 1, config[varables.SIZE_BLOCK], config[varables.SIZE_BLOCK])) + self.n_head = config[varables.NUM_HEADS] + self.single_head_dim = config[varables.DIM_ATTENTION] // self.n_head + self.attention_features = config[varables.DIM_ATTENTION] + + def forward(self, x, layer_past=None): + B, T, C = x.size() + k = self.key(x).view(B, T, self.n_head,self.single_head_dim).transpose(1, 2) + q = self.query(x).view(B, T, self.n_head,self.single_head_dim).transpose(1, 2) + v = self.value(x).view(B, T, self.n_head,self.single_head_dim).transpose(1, 2) + att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1))) + att = att.masked_fill(self.mask[:,:,:T,:T] == 0, float('-inf')) + att = F.softmax(att, dim=-1) + att = self.dropout_attention(att) + y = att @ v + y = y.transpose(1, 2).contiguous().view(B, T, self.attention_features) + y = self.dropout_residue(self.projection(y)) + return y + + +class CrossAttention(nn.Module): + def __init__(self, config): + super().__init__() + assert config[varables.DIM_ATTENTION] % config[varables.NUM_HEADS] == 0 + self.key = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION]) + self.query = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION]) + self.value = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION]) + self.dropout_attention = nn.Dropout(config[varables.RATE_DROPOUT]) + self.dropout_residue = nn.Dropout(config[varables.RATE_DROPOUT]) + self.projection = nn.Linear(config[varables.DIM_ATTENTION], config[varables.DIM_EMBEDDING]) + self.n_head = config[varables.NUM_HEADS] + self.single_head_dim = config[varables.DIM_ATTENTION] // self.n_head + self.attention_features = config[varables.DIM_ATTENTION] + self.register_buffer("mask", 
torch.tril(torch.ones(config[varables.SIZE_BLOCK], config[varables.SIZE_BLOCK])) + .view(1, 1, config[varables.SIZE_BLOCK], config[varables.SIZE_BLOCK])) + + def forward(self, x_encoder,x_decoder, layer_past=None): + B_encoder, T_encoder, C_encoder = x_encoder.size() + B_decoder, T_decoder, C_decoder = x_decoder.size() + k = self.key( x_encoder).view(B_encoder, T_encoder, self.n_head,self.single_head_dim).transpose(1, 2) + q = self.query(x_decoder).view(B_encoder, T_decoder, self.n_head,self.single_head_dim).transpose(1, 2) + v = self.value(x_encoder).view(B_encoder, T_encoder, self.n_head,self.single_head_dim).transpose(1, 2) + att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1))) + att = att.masked_fill(self.mask[:,:,:T_decoder,:T_encoder] == 0, float('-inf')) + att = F.softmax(att, dim=-1) + att = self.dropout_attention(att) + y = att @ v + y = y.transpose(1, 2).contiguous().view(B_encoder, T_decoder, self.attention_features) + y = self.dropout_residue(self.projection(y)) + return y + + + + +class EncoderBlock(nn.Module): + def __init__(self, config): + super().__init__() + self.ln1 = nn.LayerNorm(config[varables.DIM_EMBEDDING]) + self.ln2 = nn.LayerNorm(config[varables.DIM_EMBEDDING]) + self.attn = CausalSelfAttention(config) + self.mlp = nn.Sequential( + nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_FEEDFORWARD]), + nn.GELU(), + nn.Linear(config[varables.DIM_FEEDFORWARD], config[varables.DIM_EMBEDDING]), + nn.Dropout(config[varables.RATE_DROPOUT]), + ) + + def forward(self, x): + # = y_input + x = x + self.attn(self.ln1(x)) + x = x + self.mlp(self.ln2(x)) + return x + +class DecoderBlock(nn.Module): + def __init__(self, config): + super().__init__() + self.ln1 = nn.LayerNorm(config[varables.DIM_EMBEDDING]) + self.ln2 = nn.LayerNorm(config[varables.DIM_EMBEDDING]) + self.masked_attn = CausalSelfAttention(config) + self.cross_attn = CrossAttention(config) + self.mlp = nn.Sequential( + nn.Linear(config[varables.DIM_EMBEDDING], 
config[varables.DIM_FEEDFORWARD]), + nn.GELU(), + nn.Linear(config[varables.DIM_FEEDFORWARD], config[varables.DIM_EMBEDDING]), + nn.Dropout(config[varables.RATE_DROPOUT]), + ) + + def forward(self, x_encoder,x): + # = y_input + x = x + self.masked_attn(self.ln1(x)) + x = x + self.cross_attn(x_encoder,self.ln1(x)) + x = x + self.mlp(self.ln2(x)) + return x + +class Model(nn.Module): + def __init__(self, config): + super().__init__() + self.tok_emb = nn.Embedding(config[varables.SIZE_VOCAB], config[varables.DIM_EMBEDDING]) + self.pos_emb = nn.Parameter(torch.zeros(1, config[varables.SIZE_BLOCK], config[varables.DIM_EMBEDDING])) + self.drop = nn.Dropout(config[varables.RATE_DROPOUT]) + self.encoder_blocks = nn.ModuleList([EncoderBlock(config) for _ in range(config[varables.NUM_LAYERS])]) + self.decoder_blocks = nn.ModuleList([DecoderBlock(config) for _ in range(config[varables.NUM_LAYERS])]) + # self.blocks = nn.Sequential(*[DecoderBlock(config) for _ in range(config[varables.NUM_LAYERS])]) + self.ln_f = nn.LayerNorm(config[varables.DIM_EMBEDDING]) + self.head = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.SIZE_VOCAB], bias=False) + self.block_size = config[varables.SIZE_BLOCK] + self.apply(self._init_weights) + logger.info("number of parameters: %e", sum(p.numel() for p in self.parameters())) + self.optimizer = None + + def get_block_size(self): + return self.block_size + + def _init_weights(self, module): + if isinstance(module, (nn.Linear, nn.Embedding)): + module.weight.data.normal_(mean=0.0, std=0.02) + if isinstance(module, nn.Linear) and module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.LayerNorm): + module.bias.data.zero_() + module.weight.data.fill_(1.0) + def init_optimizers(self,train_config): + optimizer = torch.optim.Adam(self.parameters(), lr=train_config[varables.RATE_LEARNING]) + return optimizer + def init_scheduler(self,train_config): + scheduler = torch.optim.lr_scheduler.StepLR(self.optimizer, 
step_size=train_config[varables.SIZE_STEP], gamma=train_config[varables.GAMMA]) + return scheduler + def get_collate_fn(self, vocab): + def collate(results): + x_in = [a[0] for a in results] + y_in = [a[1] for a in results] + boundary = -1 + max_len_x = max([len(a) for a in x_in]) + max_len_y = max([len(a) for a in y_in]) + x = torch.tensor([(a+[vocab[varables.TOKEN_PAD]]*(max_len_x-len(a))) for a in x_in],dtype=torch.long) + y = torch.tensor([(a+[vocab[varables.TOKEN_PAD]]*(max_len_y-len(a))) for a in y_in],dtype=torch.long) + return x,y,boundary + return collate + + def forward(self, x_in, y_in, y_out=None,boundary=None): + x_in = self.drop(self.tok_emb(x_in) + self.pos_emb[:, :x_in.size()[1], :]) + y_in = self.drop(self.tok_emb(y_in) + self.pos_emb[:, :y_in.size()[1], :]) + # + for encoder_block in self.encoder_blocks: + x_in = encoder_block(x_in) + x_in = self.ln_f(x_in) + for decoder_block in self.decoder_blocks: + y_in = decoder_block(x_in,y_in) + y_in = self.ln_f(y_in) + logits = self.head(y_in) + loss = None + if y_out is not None: + loss = F.cross_entropy(logits.view(-1, logits.size(-1)), y_out.view(-1)) + return logits, loss + +# mark test \ No newline at end of file diff --git a/SCMG/models/Transformer_debug5/model.py b/SCMG/models/Transformer_debug5/model.py new file mode 100644 index 0000000000000000000000000000000000000000..8cba0822981298a6d9c005956bf3f2f09cffda00 --- /dev/null +++ b/SCMG/models/Transformer_debug5/model.py @@ -0,0 +1,249 @@ +import math +import logging + +import torch +import torch.nn as nn +from torch.nn import functional as F + +# logger = logging.getLogger(__name__) +from SCMG.config import varables +from torch.autograd import Variable + +class PositionalEncoder(nn.Module): + def __init__(self, config): + super(PositionalEncoder, self).__init__() + self.Dropout = nn.Dropout(p=config[varables.RATE_DROPOUT]) + max_len = config[varables.SIZE_BLOCK] + pe = torch.zeros(max_len, config[varables.DIM_EMBEDDING]) + position = 
torch.arange(0, max_len, dtype=torch.float).unsqueeze(1) + div_term = torch.exp(torch.arange(0, config[varables.DIM_EMBEDDING], 2).float() * (-math.log(10000.0) / config[varables.DIM_EMBEDDING])) + pe[:, 0::2] = torch.sin(position * div_term) + pe[:, 1::2] = torch.cos(position * div_term) + pe = pe.unsqueeze(0) + self.register_buffer('pe', pe) + def forward(self, T): + x = self.Dropout(self.pe[:,:T, :]) + return x + + +class Attention(nn.Module): + def __init__(self, config): + super().__init__() + assert config[varables.DIM_ATTENTION] % config[varables.NUM_HEADS] == 0 + self.Key = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION]) + self.Query = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION]) + self.Value = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION]) + self.Dropout_Attention = nn.Dropout(config[varables.RATE_DROPOUT]) + self.Dropout_Residue = nn.Dropout(config[varables.RATE_DROPOUT]) + self.Projection = nn.Linear(config[varables.DIM_ATTENTION], config[varables.DIM_EMBEDDING]) + self.NumberOfHeads = config[varables.NUM_HEADS] + self.DimHead = config[varables.DIM_ATTENTION] // self.NumberOfHeads + self.DimAttention = config[varables.DIM_ATTENTION] + + def forward(self, X_1,X_2, mask=None): + if X_2 is None: + X_2 = X_1 + BatchSize, T_Encoder, _ = X_1.size() + BatchSize, T_Decoder, _ = X_2.size() + K = self.Key( X_1).view(BatchSize, T_Encoder, self.NumberOfHeads,self.DimHead).transpose(1, 2) + Q = self.Query(X_2).view(BatchSize, T_Decoder, self.NumberOfHeads,self.DimHead).transpose(1, 2) + V = self.Value(X_1).view(BatchSize, T_Encoder, self.NumberOfHeads,self.DimHead).transpose(1, 2) + # k,q,v dimension: (BatchSize, SequenceSize, NumberOfHeads, HeadDimension) 3,4,5,16 + ScoreAttention = (Q @ K.transpose(-2, -1)) / math.sqrt(self.DimHead) + ScoreAttention = ScoreAttention.masked_fill(mask==0, -1e9) + ScoreAttention = F.softmax(ScoreAttention, dim=-1) + ScoreAttention = 
self.Dropout_Attention(ScoreAttention) + # k.transpose(-2,-1): 3,4,16,5 + # (q@(k.transpose(-2,-1))): 3,4,5,5 + Z = ScoreAttention @ V + # y dimension: 3,4,5,16 + Z = Z.transpose(1, 2).contiguous().view(BatchSize, T_Decoder, self.DimAttention) + # y dimension: 3,5,64 + Z = self.Dropout_Residue(self.Projection(Z)) + return Z + + + + + + + + + + +class FeedForward(nn.Module): + def __init__(self, config): + super().__init__() + if config[varables.DIM_FEEDFORWARD] == 0: + Dim_FeedForward = config[varables.DIM_ATTENTION] *4 + else: + Dim_FeedForward = config[varables.DIM_FEEDFORWARD] + self.Linear1 = nn.Linear(config[varables.DIM_EMBEDDING], Dim_FeedForward) + self.GELU = nn.GELU() + self.Linear2 = nn.Linear(Dim_FeedForward, config[varables.DIM_EMBEDDING]) + self.Dropout = nn.Dropout(config[varables.RATE_DROPOUT]) + + def forward(self,x): + x = self.Linear1(x) + x = self.GELU (x) + x = self.Dropout(x) + x = self.Linear2(x) + return x + + + + +class EncoderBlock(nn.Module): + def __init__(self, config): + super().__init__() + self.LayerNorm1 = nn.LayerNorm(config[varables.DIM_EMBEDDING]) + self.LayerNorm2 = nn.LayerNorm(config[varables.DIM_EMBEDDING]) + self.Dropout1 = nn.Dropout(config[varables.RATE_DROPOUT]) + self.Dropout2 = nn.Dropout(config[varables.RATE_DROPOUT]) + self.Attention = Attention( config) + self.FeedForward = FeedForward(config) + + def forward(self, X_Encoder,Mask_Encoder): + X_Encoder = self.LayerNorm1(X_Encoder + self.Attention (self.Dropout1(X_Encoder), None, Mask_Encoder)) + X_Encoder = self.LayerNorm2(X_Encoder + self.FeedForward(self.Dropout2(X_Encoder))) + return X_Encoder + +class DecoderBlock(nn.Module): + def __init__(self, config): + super().__init__() + self.LayerNorm1 = nn.LayerNorm(config[varables.DIM_EMBEDDING]) + self.LayerNorm2 = nn.LayerNorm(config[varables.DIM_EMBEDDING]) + self.LayerNorm3 = nn.LayerNorm(config[varables.DIM_EMBEDDING]) + self.Dropout1 = nn.Dropout(config[varables.RATE_DROPOUT]) + self.Dropout2 = 
nn.Dropout(config[varables.RATE_DROPOUT]) + self.Dropout3 = nn.Dropout(config[varables.RATE_DROPOUT]) + self.AttentionMasked = Attention( config) + self.AttentionCross = Attention( config) + self.FeedForward = FeedForward(config) + + def forward(self, X_Encoder,X_Decoder,Mask_Cross,Mask_Decoder): + X_Decoder = self.LayerNorm1(X_Decoder + self.AttentionMasked(self.Dropout1(X_Decoder), None, Mask_Decoder)) + X_Decoder = self.LayerNorm2(X_Decoder + self.AttentionCross ( X_Encoder, self.Dropout2(X_Decoder), Mask_Cross )) + X_Decoder = self.LayerNorm3(X_Decoder + self.FeedForward (self.Dropout3(X_Decoder) )) + return X_Decoder + + + + + + + + + + + + + + + + + + +class Model(nn.Module): + def __init__(self, config): + super().__init__() + # Varables + self.Dim_Embedding = config[varables.DIM_EMBEDDING] + self.Token_Padding_Encoder = config["Token_Padding_Encoder"] + self.Token_Padding_Decoder = config["Token_Padding_Decoder"] + # Embedding and positional encoding layers + self.Embedding_Encoder = nn.Embedding(len(config["vocab_encoder"]), config[varables.DIM_EMBEDDING]) + self.Embedding_Decoder = nn.Embedding(len(config["vocab_decoder"]), config[varables.DIM_EMBEDDING]) + self.pos_emb = PositionalEncoder(config) + # Dropout and normalization layers + self.Dropout1 = nn.Dropout(config[varables.RATE_DROPOUT]) + self.Dropout2 = nn.Dropout(config[varables.RATE_DROPOUT]) + self.LayerNorm1 = nn.LayerNorm(config[varables.DIM_EMBEDDING]) + self.LayerNorm2 = nn.LayerNorm(config[varables.DIM_EMBEDDING]) + # Transformer layers + self.encoder_blocks = nn.ModuleList([EncoderBlock(config) for _ in range(config[varables.NUM_LAYERS])]) + self.decoder_blocks = nn.ModuleList([DecoderBlock(config) for _ in range(config[varables.NUM_LAYERS])]) + # Output layer + self.head = nn.Linear(config[varables.DIM_EMBEDDING], len(config["vocab_decoder"]), bias=False) + # Init + self.apply(self._init_weights) + self.optimizer = None + # logger.info("number of parameters: %e", sum(p.numel() for p in 
# --- Methods of the Transformer encoder-decoder Model class ---------------
# NOTE(review): the enclosing `class Model(nn.Module)` header and __init__
# live above this chunk; these definitions belong at method indentation
# inside that class.

def _init_weights(self, module):
    """Xavier-initialise every parameter tensor with more than one dimension.

    Applied via ``self.apply`` so it visits each sub-module once; 1-D
    parameters (biases, LayerNorm scales) keep their default init.
    """
    for p in module.parameters():
        if p.dim() > 1:
            nn.init.xavier_uniform_(p)

def init_optimizers(self, train_config):
    """Build (and store) an Adam optimizer from ``train_config``.

    Fix: the original returned the optimizer without assigning
    ``self.optimizer``, yet ``init_scheduler`` reads ``self.optimizer``
    (initialised to ``None`` in ``__init__``) — calling ``init_scheduler``
    without a manual assignment crashed.  Storing it here keeps the return
    value unchanged and makes the pair usable stand-alone.
    """
    optimizer = torch.optim.Adam(self.parameters(), lr=train_config[varables.RATE_LEARNING])
    self.optimizer = optimizer
    return optimizer

def init_scheduler(self, train_config):
    """Step-decay LR scheduler over the stored optimizer (see init_optimizers)."""
    scheduler = torch.optim.lr_scheduler.StepLR(
        self.optimizer,
        step_size=train_config[varables.SIZE_STEP],
        gamma=train_config[varables.GAMMA],
    )
    return scheduler

def get_collate_fn(self, vocab_encoder, vocab_decoder):
    """Return a DataLoader ``collate_fn`` that right-pads encoder/decoder
    token-id sequences to the batch maximum with each vocab's PAD id.

    The returned tuple is ``(x, y, boundary)``; ``boundary`` is a constant
    ``-1`` placeholder kept for interface parity with other model variants.
    """
    def collate(results):
        X_Encoder = [a[0] for a in results]
        X_Decoder = [a[1] for a in results]
        boundary = -1
        max_len_x = max(len(a) for a in X_Encoder)
        max_len_y = max(len(a) for a in X_Decoder)
        x = torch.tensor(
            [a + [vocab_encoder[varables.TOKEN_PAD]] * (max_len_x - len(a)) for a in X_Encoder],
            dtype=torch.long)
        y = torch.tensor(
            [a + [vocab_decoder[varables.TOKEN_PAD]] * (max_len_y - len(a)) for a in X_Decoder],
            dtype=torch.long)
        return x, y, boundary
    return collate

def generate_masks(self, X_Encoder, X_Decoder):
    """Build broadcastable attention masks from padding positions.

    Returns (Mask_Encoder, Mask_Cross) of shape (B, 1, 1, T_enc) and
    Mask_Decoder of shape (B, 1, T_dec, T_dec) — padding mask combined
    with a lower-triangular causal mask.
    """
    T = X_Decoder.shape[1]
    Mask_Encoder = (X_Encoder != self.Token_Padding_Encoder).unsqueeze(-2).unsqueeze(-2)
    Mask_Decoder = (X_Decoder != self.Token_Padding_Decoder).unsqueeze(-2).unsqueeze(-2).repeat(1, 1, T, 1)
    Mask_Cross = (X_Encoder != self.Token_Padding_Encoder).unsqueeze(-2).unsqueeze(-2)
    mask_tril = torch.tril(torch.ones(T, T)).view(1, 1, T, T).to(Mask_Decoder.device)
    Mask_Decoder = Mask_Decoder.masked_fill(mask_tril == 0, 0)
    return Mask_Encoder, Mask_Decoder, Mask_Cross

def forward(self, X_Encoder, X_Decoder, Y_Decoder_Ref=None, boundary=None):
    """Full encoder-decoder pass over token-id batches.

    Parameters: X_Encoder/X_Decoder are (B, T) long tensors; Y_Decoder_Ref,
    when given, is the shifted target used for the loss; ``boundary`` is
    accepted for interface parity and unused here.
    Returns ``(logits, loss)`` where loss is None without a reference.
    Padding targets are excluded from the loss via ``ignore_index``.
    """
    Mask_Encoder, Mask_Decoder, Mask_Cross = self.generate_masks(X_Encoder, X_Decoder)
    # Scaled embeddings + sinusoidal positions, then embedding dropout.
    X_Encoder = self.Dropout1(
        self.Embedding_Encoder(X_Encoder) * math.sqrt(self.Dim_Embedding)
        + self.pos_emb(X_Encoder.size(1)))
    X_Decoder = self.Dropout2(
        self.Embedding_Decoder(X_Decoder) * math.sqrt(self.Dim_Embedding)
        + self.pos_emb(X_Decoder.size(1)))
    for encoder_block in self.encoder_blocks:
        X_Encoder = encoder_block(X_Encoder, Mask_Encoder)
    for decoder_block in self.decoder_blocks:
        X_Decoder = decoder_block(X_Encoder, X_Decoder, Mask_Cross, Mask_Decoder)
    Y_Decoder_Logits = self.head(X_Decoder)
    loss = None
    if Y_Decoder_Ref is not None:
        # nll_loss over explicit log_softmax (equivalent to cross_entropy)
        # with padding positions ignored.
        loss = F.nll_loss(
            F.log_softmax(Y_Decoder_Logits, dim=-1).view(-1, Y_Decoder_Logits.size(-1)),
            Y_Decoder_Ref.view(-1),
            ignore_index=self.Token_Padding_Decoder)
    return Y_Decoder_Logits, loss
"""Autoregressive sampling utilities for the encoder-decoder model.

Fix: the original module defined ``sample`` twice; the second definition
(multinomial sampling with a ``boundary`` kwarg) silently shadowed the first
(temperature + optional top-k / greedy decoding).  The two are merged below
into one backward-compatible function.
"""
import random

import numpy as np
import torch
from torch.nn import functional as F


def set_seed(seed):
    """Seed python, numpy and torch (incl. CUDA) RNGs for reproducibility."""
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)


def top_k_logits(logits, k):
    """Return a copy of ``logits`` with everything below the k-th largest
    value per row set to -inf (so softmax gives them zero probability)."""
    v, ix = torch.topk(logits, k)
    out = logits.clone()
    out[out < v[:, [-1]]] = -float('Inf')
    return out


@torch.no_grad()
def sample(model, x, steps, temperature=1.0, sample=True, top_k=None, boundary=None):
    """Extend token batch ``x`` by ``steps`` tokens, one step at a time.

    Defaults reproduce the previously-live (second) definition: multinomial
    sampling with ``boundary`` forwarded to the model.  ``sample=False``
    restores greedy argmax decoding and ``top_k`` restores top-k filtering
    from the definition that had been shadowed.
    The context is truncated to the model's block size each step.
    """
    block_size = model.get_block_size()
    model.eval()
    for _ in range(steps):
        x_cond = x if x.size(1) <= block_size else x[:, -block_size:]
        logits, _ = model(x_cond, boundary=boundary)
        logits = logits[:, -1, :] / temperature
        if top_k is not None:
            logits = top_k_logits(logits, top_k)
        probs = F.softmax(logits, dim=-1)
        if sample:
            ix = torch.multinomial(probs, num_samples=1)
        else:
            _, ix = torch.topk(probs, k=1, dim=-1)
        x = torch.cat((x, ix), dim=1)
    return x


# Example scaffold input left in the original module (debug residue):
# 'L_5*C(=O)NCc1cccc(OC)c1.*c1nsc2ccccc12COc1cccc(CNC(=O)c2cccc(NC(=O)c3nsc4ccccc34)c2)c1'

def sample_L(i, option='string'):
    """Debug helper: sample 32 completions for linker index ``i`` and print
    each generated string plus a 1/0 validity flag.

    NOTE(review): relies on module-level globals ``vocab``, ``inv``,
    ``model`` and ``test_valid`` that are NOT defined in this file (leftover
    from an interactive session) — calling it as-is raises NameError.
    Special-token literals appear stripped to '' by text extraction
    (likely '<bos>'/'<eos>'-style tokens); verify against the vocab.
    """
    prefix = 'L_' + str(i)
    string_input = prefix + '*O=C1NN=Cc2c1cccc2.*O=C(C1CC1)N1CCNCC1'
    array_input = [vocab[a] for a in [''] + list(string_input)]
    boundary = [len(array_input)]
    tensor_input = torch.tensor(array_input, device='cuda').unsqueeze(0).repeat(32, 1)
    boundary = boundary * 32
    tensor_output = sample(model, tensor_input, 250, boundary=boundary)
    strings_output = []
    for j in range(tensor_output.shape[0]):
        list_string_output = [
            inv[a] for a in tensor_output[j, boundary[j]:].cpu().numpy()
            if a != vocab['']
        ]
        if list_string_output[-1] == '':
            list_string_output = list_string_output[:-1]
        string_output = ''.join(list_string_output)
        strings_output.append(string_output)
        print(string_output)
    for j in range(tensor_output.shape[0]):
        print(1 if test_valid(strings_output[j]) else 0)
"""Backup file ('model copy 2.py'): encoder-decoder transformer built from a
pre-norm layer stack (Norm / MultiHeadAttention / FeedForward).

The first four classes (CausalSelfAttention, CrossAttention, EncoderBlock,
DecoderBlock) are an older, unused stack kept for reference; ``Model`` uses
the Encoder/Decoder stack below them.  Behavior preserved from the original;
review notes are marked NOTE(review).
"""
import copy
import logging
import math

import torch
import torch.nn as nn
from torch.autograd import Variable
from torch.nn import functional as F

from SCMG.config import varables

logger = logging.getLogger(__name__)


class CausalSelfAttention(nn.Module):
    """Masked multi-head self-attention (legacy stack, unused by Model)."""

    def __init__(self, config):
        super().__init__()
        assert config[varables.DIM_ATTENTION] % config[varables.NUM_HEADS] == 0
        dim_in = config[varables.DIM_EMBEDDING]
        dim_att = config[varables.DIM_ATTENTION]
        self.key = nn.Linear(dim_in, dim_att)
        self.query = nn.Linear(dim_in, dim_att)
        self.value = nn.Linear(dim_in, dim_att)
        self.dropout_attention = nn.Dropout(config[varables.RATE_DROPOUT])
        self.dropout_residue = nn.Dropout(config[varables.RATE_DROPOUT])
        self.projection = nn.Linear(dim_att, dim_in)
        # Pre-built lower-triangular causal mask up to SIZE_BLOCK positions.
        block = config[varables.SIZE_BLOCK]
        self.register_buffer(
            "mask", torch.tril(torch.ones(block, block)).view(1, 1, block, block))
        self.n_head = config[varables.NUM_HEADS]
        self.single_head_dim = dim_att // self.n_head
        self.attention_features = dim_att

    def forward(self, x, layer_past=None):
        B, T, _ = x.size()
        k = self.key(x).view(B, T, self.n_head, self.single_head_dim).transpose(1, 2)
        q = self.query(x).view(B, T, self.n_head, self.single_head_dim).transpose(1, 2)
        v = self.value(x).view(B, T, self.n_head, self.single_head_dim).transpose(1, 2)
        att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1)))
        att = att.masked_fill(self.mask[:, :, :T, :T] == 0, float('-inf'))
        att = self.dropout_attention(F.softmax(att, dim=-1))
        y = (att @ v).transpose(1, 2).contiguous().view(B, T, self.attention_features)
        return self.dropout_residue(self.projection(y))


class CrossAttention(nn.Module):
    """Decoder-queries / encoder-keys attention (legacy stack, unused).

    NOTE(review): applies the causal triangle ``mask[:T_dec, :T_enc]`` to
    CROSS attention, restricting decoder position t to encoder positions
    <= t — unusual for an encoder-decoder; confirm intent before reuse.
    """

    def __init__(self, config):
        super().__init__()
        assert config[varables.DIM_ATTENTION] % config[varables.NUM_HEADS] == 0
        dim_in = config[varables.DIM_EMBEDDING]
        dim_att = config[varables.DIM_ATTENTION]
        self.key = nn.Linear(dim_in, dim_att)
        self.query = nn.Linear(dim_in, dim_att)
        self.value = nn.Linear(dim_in, dim_att)
        self.dropout_attention = nn.Dropout(config[varables.RATE_DROPOUT])
        self.dropout_residue = nn.Dropout(config[varables.RATE_DROPOUT])
        self.projection = nn.Linear(dim_att, dim_in)
        self.n_head = config[varables.NUM_HEADS]
        self.single_head_dim = dim_att // self.n_head
        self.attention_features = dim_att
        block = config[varables.SIZE_BLOCK]
        self.register_buffer(
            "mask", torch.tril(torch.ones(block, block)).view(1, 1, block, block))

    def forward(self, x_encoder, x_decoder, layer_past=None):
        B, T_enc, _ = x_encoder.size()
        _, T_dec, _ = x_decoder.size()
        k = self.key(x_encoder).view(B, T_enc, self.n_head, self.single_head_dim).transpose(1, 2)
        q = self.query(x_decoder).view(B, T_dec, self.n_head, self.single_head_dim).transpose(1, 2)
        v = self.value(x_encoder).view(B, T_enc, self.n_head, self.single_head_dim).transpose(1, 2)
        att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1)))
        att = att.masked_fill(self.mask[:, :, :T_dec, :T_enc] == 0, float('-inf'))
        att = self.dropout_attention(F.softmax(att, dim=-1))
        y = (att @ v).transpose(1, 2).contiguous().view(B, T_dec, self.attention_features)
        return self.dropout_residue(self.projection(y))


class EncoderBlock(nn.Module):
    """Pre-norm encoder block: self-attention + GELU MLP (legacy, unused)."""

    def __init__(self, config):
        super().__init__()
        dim = config[varables.DIM_EMBEDDING]
        self.ln1 = nn.LayerNorm(dim)
        self.ln2 = nn.LayerNorm(dim)
        self.attn = CausalSelfAttention(config)
        self.mlp = nn.Sequential(
            nn.Linear(dim, config[varables.DIM_FEEDFORWARD]),
            nn.GELU(),
            nn.Linear(config[varables.DIM_FEEDFORWARD], dim),
            nn.Dropout(config[varables.RATE_DROPOUT]),
        )

    def forward(self, x):
        x = x + self.attn(self.ln1(x))
        x = x + self.mlp(self.ln2(x))
        return x


class DecoderBlock(nn.Module):
    """Pre-norm decoder block (legacy, unused).

    NOTE(review): ``ln1`` is reused for both the self- and cross-attention
    pre-norms (no separate third LayerNorm) — probably unintended, kept as-is.
    """

    def __init__(self, config):
        super().__init__()
        dim = config[varables.DIM_EMBEDDING]
        self.ln1 = nn.LayerNorm(dim)
        self.ln2 = nn.LayerNorm(dim)
        self.masked_attn = CausalSelfAttention(config)
        self.cross_attn = CrossAttention(config)
        self.mlp = nn.Sequential(
            nn.Linear(dim, config[varables.DIM_FEEDFORWARD]),
            nn.GELU(),
            nn.Linear(config[varables.DIM_FEEDFORWARD], dim),
            nn.Dropout(config[varables.RATE_DROPOUT]),
        )

    def forward(self, x_encoder, x):
        x = x + self.masked_attn(self.ln1(x))
        x = x + self.cross_attn(x_encoder, self.ln1(x))
        x = x + self.mlp(self.ln2(x))
        return x


class Norm(nn.Module):
    """Layer normalisation with learnable scale (alpha) and shift (bias)."""

    def __init__(self, d_model, eps=1e-6):
        super().__init__()
        self.size = d_model
        self.alpha = nn.Parameter(torch.ones(self.size))
        self.bias = nn.Parameter(torch.zeros(self.size))
        self.eps = eps

    def forward(self, x):
        centered = x - x.mean(dim=-1, keepdim=True)
        return self.alpha * centered / (x.std(dim=-1, keepdim=True) + self.eps) + self.bias


def attention(q, k, v, d_k, mask=None, dropout=None):
    """Scaled dot-product attention; ``mask==0`` positions get -1e9."""
    scores = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(d_k)
    if mask is not None:
        mask = mask.unsqueeze(1)  # broadcast over heads
        scores = scores.masked_fill(mask == 0, -1e9)
    scores = F.softmax(scores, dim=-1)
    if dropout is not None:
        scores = dropout(scores)
    return torch.matmul(scores, v)


class MultiHeadAttention(nn.Module):
    """Multi-head attention with separate q/k/v projections and output linear."""

    def __init__(self, heads, d_model, dropout=0.1):
        super().__init__()
        self.d_model = d_model
        self.d_k = d_model // heads
        self.h = heads
        self.q_linear = nn.Linear(d_model, d_model)
        self.v_linear = nn.Linear(d_model, d_model)
        self.k_linear = nn.Linear(d_model, d_model)
        self.dropout = nn.Dropout(dropout)
        self.out = nn.Linear(d_model, d_model)

    def forward(self, q, k, v, mask=None):
        bs = q.size(0)
        # Project, split into heads, move head axis before sequence axis.
        k = self.k_linear(k).view(bs, -1, self.h, self.d_k).transpose(1, 2)
        q = self.q_linear(q).view(bs, -1, self.h, self.d_k).transpose(1, 2)
        v = self.v_linear(v).view(bs, -1, self.h, self.d_k).transpose(1, 2)
        scores = attention(q, k, v, self.d_k, mask, self.dropout)
        concat = scores.transpose(1, 2).contiguous().view(bs, -1, self.d_model)
        return self.out(concat)


class FeedForward(nn.Module):
    """Position-wise FFN: linear -> ReLU -> dropout -> linear."""

    def __init__(self, d_model, d_ff=2048, dropout=0.1):
        super().__init__()
        self.linear_1 = nn.Linear(d_model, d_ff)
        self.dropout = nn.Dropout(dropout)
        self.linear_2 = nn.Linear(d_ff, d_model)

    def forward(self, x):
        return self.linear_2(self.dropout(F.relu(self.linear_1(x))))


class EncoderLayer(nn.Module):
    """Pre-norm encoder layer: self-attention then feed-forward."""

    def __init__(self, d_model, heads, dropout=0.1):
        super().__init__()
        self.norm_1 = Norm(d_model)
        self.norm_2 = Norm(d_model)
        self.attn = MultiHeadAttention(heads, d_model, dropout=dropout)
        self.ff = FeedForward(d_model, dropout=dropout)
        self.dropout_1 = nn.Dropout(dropout)
        self.dropout_2 = nn.Dropout(dropout)

    def forward(self, x, mask):
        x2 = self.norm_1(x)
        x = x + self.dropout_1(self.attn(x2, x2, x2, mask))
        x2 = self.norm_2(x)
        x = x + self.dropout_2(self.ff(x2))
        return x


class DecoderLayer(nn.Module):
    """Pre-norm decoder layer: masked self-attention, cross-attention, FFN."""

    def __init__(self, d_model, heads, dropout=0.1):
        super().__init__()
        self.norm_1 = Norm(d_model)
        self.norm_2 = Norm(d_model)
        self.norm_3 = Norm(d_model)
        self.dropout_1 = nn.Dropout(dropout)
        self.dropout_2 = nn.Dropout(dropout)
        self.dropout_3 = nn.Dropout(dropout)
        self.attn_1 = MultiHeadAttention(heads, d_model, dropout=dropout)
        self.attn_2 = MultiHeadAttention(heads, d_model, dropout=dropout)
        self.ff = FeedForward(d_model, dropout=dropout)

    def forward(self, x, e_outputs, src_mask, trg_mask):
        x2 = self.norm_1(x)
        x = x + self.dropout_1(self.attn_1(x2, x2, x2, trg_mask))
        x2 = self.norm_2(x)
        x = x + self.dropout_2(self.attn_2(x2, e_outputs, e_outputs, src_mask))
        x2 = self.norm_3(x)
        x = x + self.dropout_3(self.ff(x2))
        return x


class Embedder(nn.Module):
    """Thin wrapper around nn.Embedding."""

    def __init__(self, vocab_size, d_model):
        super().__init__()
        self.d_model = d_model
        self.embed = nn.Embedding(vocab_size, d_model)

    def forward(self, x):
        return self.embed(x)


class PositionalEncoder(nn.Module):
    """Sinusoidal positional encoding added to scaled embeddings.

    NOTE(review): the loop assumes ``d_model`` is even (it writes
    ``pe[pos, i + 1]``), and the exponent uses ``2*i`` where the canonical
    formula uses ``i`` — kept byte-for-byte-equivalent to the original.
    """

    def __init__(self, d_model, max_seq_len=200, dropout=0.1):
        super().__init__()
        self.d_model = d_model
        self.dropout = nn.Dropout(dropout)
        pe = torch.zeros(max_seq_len, d_model)
        for pos in range(max_seq_len):
            for i in range(0, d_model, 2):
                pe[pos, i] = math.sin(pos / (10000 ** ((2 * i) / d_model)))
                pe[pos, i + 1] = math.cos(pos / (10000 ** ((2 * (i + 1)) / d_model)))
        self.register_buffer('pe', pe.unsqueeze(0))

    def forward(self, x):
        x = x * math.sqrt(self.d_model)  # scale embeddings up first
        seq_len = x.size(1)
        pe = Variable(self.pe[:, :seq_len], requires_grad=False)
        if x.is_cuda:
            # NOTE(review): `.cuda()` returns a new tensor that is discarded
            # here; the buffer is already moved with the module, so this line
            # is a no-op — kept for behavioral parity.
            pe.cuda()
        x = x + pe
        return self.dropout(x)


def get_clones(module, N):
    """N independent deep copies of ``module`` as a ModuleList."""
    return nn.ModuleList([copy.deepcopy(module) for _ in range(N)])


class Encoder(nn.Module):
    """Embedding + positional encoding + N EncoderLayers + final Norm."""

    def __init__(self, vocab_size, d_model, N, heads, dropout):
        super().__init__()
        self.N = N
        self.embed = Embedder(vocab_size, d_model)
        self.pe = PositionalEncoder(d_model, dropout=dropout)
        self.layers = get_clones(EncoderLayer(d_model, heads, dropout), N)
        self.norm = Norm(d_model)

    def forward(self, src, mask):
        x = self.pe(self.embed(src))
        for layer in self.layers:
            x = layer(x, mask)
        return self.norm(x)


class Decoder(nn.Module):
    """Embedding + positional encoding + N DecoderLayers + final Norm."""

    def __init__(self, vocab_size, d_model, N, heads, dropout):
        super().__init__()
        self.N = N
        self.embed = Embedder(vocab_size, d_model)
        self.pe = PositionalEncoder(d_model, dropout=dropout)
        self.layers = get_clones(DecoderLayer(d_model, heads, dropout), N)
        self.norm = Norm(d_model)

    def forward(self, trg, e_outputs, src_mask, trg_mask):
        x = self.pe(self.embed(trg))
        for layer in self.layers:
            x = layer(x, e_outputs, src_mask, trg_mask)
        return self.norm(x)


class Model(nn.Module):
    """Encoder-decoder model over the Encoder/Decoder stack above."""

    def __init__(self, config):
        super().__init__()
        self.encoder = Encoder(len(config["vocab_encoder"]), config[varables.DIM_ATTENTION],
                               config[varables.NUM_LAYERS], config[varables.NUM_HEADS],
                               config[varables.RATE_DROPOUT])
        self.decoder = Decoder(len(config["vocab_decoder"]), config[varables.DIM_ATTENTION],
                               config[varables.NUM_LAYERS], config[varables.NUM_HEADS],
                               config[varables.RATE_DROPOUT])
        self.out = nn.Linear(config[varables.DIM_ATTENTION], len(config["vocab_decoder"]))
        self.optimizer = None

    def get_block_size(self):
        # NOTE(review): ``self.block_size`` is never assigned in this backup
        # file (the assignment is commented out), so this raises
        # AttributeError if called — confirm before relying on it.
        return self.block_size

    def _init_weights(self, module):
        """Normal(0, 0.02) init for Linear/Embedding; unit LayerNorm."""
        if isinstance(module, (nn.Linear, nn.Embedding)):
            module.weight.data.normal_(mean=0.0, std=0.02)
            if isinstance(module, nn.Linear) and module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

    def init_optimizers(self, train_config):
        """Adam optimizer from the training config (returned, not stored)."""
        return torch.optim.Adam(self.parameters(), lr=train_config[varables.RATE_LEARNING])

    def init_scheduler(self, train_config):
        """StepLR over ``self.optimizer`` (caller must have assigned it)."""
        return torch.optim.lr_scheduler.StepLR(
            self.optimizer,
            step_size=train_config[varables.SIZE_STEP],
            gamma=train_config[varables.GAMMA])

    def get_collate_fn(self, vocab_encoder, vocab_decoder):
        """collate_fn right-padding encoder/decoder sequences with PAD ids."""
        def collate(results):
            x_in = [a[0] for a in results]
            y_in = [a[1] for a in results]
            boundary = -1  # placeholder kept for interface parity
            max_len_x = max(len(a) for a in x_in)
            max_len_y = max(len(a) for a in y_in)
            x = torch.tensor(
                [a + [vocab_encoder[varables.TOKEN_PAD]] * (max_len_x - len(a)) for a in x_in],
                dtype=torch.long)
            y = torch.tensor(
                [a + [vocab_decoder[varables.TOKEN_PAD]] * (max_len_y - len(a)) for a in y_in],
                dtype=torch.long)
            return x, y, boundary
        return collate

    def forward(self, src, trg, trg_out, boundary=None):
        """Return (logits, loss); loss is None when ``trg_out`` is None.

        NOTE(review): the loss has no ignore_index, so padded target
        positions contribute to it — the newer model.py excludes them.
        """
        src_mask = None
        T = trg.shape[1]
        trg_mask = torch.tril(torch.ones(T, T)).view(1, 1, T, T).to(trg.device)
        e_outputs = self.encoder(src, src_mask)
        d_output = self.decoder(trg, e_outputs, src_mask, trg_mask)
        logits = self.out(d_output)
        loss = None
        if trg_out is not None:
            loss = F.cross_entropy(logits.view(-1, logits.size(-1)), trg_out.view(-1))
        return logits, loss
"""Backup file ('model copy.py'): encoder-decoder transformer with a SHARED
token embedding and a learned positional embedding Parameter.

Behavior preserved from the original; review notes marked NOTE(review).
"""
import logging
import math

import torch
import torch.nn as nn
from torch.nn import functional as F

from SCMG.config import varables

logger = logging.getLogger(__name__)


class CausalSelfAttention(nn.Module):
    """Masked multi-head self-attention with a pre-built triangular mask."""

    def __init__(self, config):
        super().__init__()
        assert config[varables.DIM_ATTENTION] % config[varables.NUM_HEADS] == 0
        dim_in = config[varables.DIM_EMBEDDING]
        dim_att = config[varables.DIM_ATTENTION]
        self.key = nn.Linear(dim_in, dim_att)
        self.query = nn.Linear(dim_in, dim_att)
        self.value = nn.Linear(dim_in, dim_att)
        self.dropout_attention = nn.Dropout(config[varables.RATE_DROPOUT])
        self.dropout_residue = nn.Dropout(config[varables.RATE_DROPOUT])
        self.projection = nn.Linear(dim_att, dim_in)
        block = config[varables.SIZE_BLOCK]
        self.register_buffer(
            "mask", torch.tril(torch.ones(block, block)).view(1, 1, block, block))
        self.n_head = config[varables.NUM_HEADS]
        self.single_head_dim = dim_att // self.n_head
        self.attention_features = dim_att

    def forward(self, x, layer_past=None):
        B, T, _ = x.size()
        k = self.key(x).view(B, T, self.n_head, self.single_head_dim).transpose(1, 2)
        q = self.query(x).view(B, T, self.n_head, self.single_head_dim).transpose(1, 2)
        v = self.value(x).view(B, T, self.n_head, self.single_head_dim).transpose(1, 2)
        att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1)))
        att = att.masked_fill(self.mask[:, :, :T, :T] == 0, float('-inf'))
        att = self.dropout_attention(F.softmax(att, dim=-1))
        y = (att @ v).transpose(1, 2).contiguous().view(B, T, self.attention_features)
        return self.dropout_residue(self.projection(y))


class CrossAttention(nn.Module):
    """Decoder-queries / encoder-keys attention.

    NOTE(review): applies the causal triangle ``mask[:T_dec, :T_enc]`` to
    CROSS attention (decoder position t only sees encoder positions <= t) —
    unusual; confirm intent before reuse.
    """

    def __init__(self, config):
        super().__init__()
        assert config[varables.DIM_ATTENTION] % config[varables.NUM_HEADS] == 0
        dim_in = config[varables.DIM_EMBEDDING]
        dim_att = config[varables.DIM_ATTENTION]
        self.key = nn.Linear(dim_in, dim_att)
        self.query = nn.Linear(dim_in, dim_att)
        self.value = nn.Linear(dim_in, dim_att)
        self.dropout_attention = nn.Dropout(config[varables.RATE_DROPOUT])
        self.dropout_residue = nn.Dropout(config[varables.RATE_DROPOUT])
        self.projection = nn.Linear(dim_att, dim_in)
        self.n_head = config[varables.NUM_HEADS]
        self.single_head_dim = dim_att // self.n_head
        self.attention_features = dim_att
        block = config[varables.SIZE_BLOCK]
        self.register_buffer(
            "mask", torch.tril(torch.ones(block, block)).view(1, 1, block, block))

    def forward(self, x_encoder, x_decoder, layer_past=None):
        B, T_enc, _ = x_encoder.size()
        _, T_dec, _ = x_decoder.size()
        k = self.key(x_encoder).view(B, T_enc, self.n_head, self.single_head_dim).transpose(1, 2)
        q = self.query(x_decoder).view(B, T_dec, self.n_head, self.single_head_dim).transpose(1, 2)
        v = self.value(x_encoder).view(B, T_enc, self.n_head, self.single_head_dim).transpose(1, 2)
        att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1)))
        att = att.masked_fill(self.mask[:, :, :T_dec, :T_enc] == 0, float('-inf'))
        att = self.dropout_attention(F.softmax(att, dim=-1))
        y = (att @ v).transpose(1, 2).contiguous().view(B, T_dec, self.attention_features)
        return self.dropout_residue(self.projection(y))


class EncoderBlock(nn.Module):
    """Pre-norm encoder block: self-attention + GELU MLP, both residual."""

    def __init__(self, config):
        super().__init__()
        dim = config[varables.DIM_EMBEDDING]
        self.ln1 = nn.LayerNorm(dim)
        self.ln2 = nn.LayerNorm(dim)
        self.attn = CausalSelfAttention(config)
        self.mlp = nn.Sequential(
            nn.Linear(dim, config[varables.DIM_FEEDFORWARD]),
            nn.GELU(),
            nn.Linear(config[varables.DIM_FEEDFORWARD], dim),
            nn.Dropout(config[varables.RATE_DROPOUT]),
        )

    def forward(self, x):
        x = x + self.attn(self.ln1(x))
        x = x + self.mlp(self.ln2(x))
        return x


class DecoderBlock(nn.Module):
    """Pre-norm decoder block.

    NOTE(review): ``ln1`` is reused for both the self- and cross-attention
    pre-norms — probably unintended, kept as-is for behavioral parity.
    """

    def __init__(self, config):
        super().__init__()
        dim = config[varables.DIM_EMBEDDING]
        self.ln1 = nn.LayerNorm(dim)
        self.ln2 = nn.LayerNorm(dim)
        self.masked_attn = CausalSelfAttention(config)
        self.cross_attn = CrossAttention(config)
        self.mlp = nn.Sequential(
            nn.Linear(dim, config[varables.DIM_FEEDFORWARD]),
            nn.GELU(),
            nn.Linear(config[varables.DIM_FEEDFORWARD], dim),
            nn.Dropout(config[varables.RATE_DROPOUT]),
        )

    def forward(self, x_encoder, x):
        x = x + self.masked_attn(self.ln1(x))
        x = x + self.cross_attn(x_encoder, self.ln1(x))
        x = x + self.mlp(self.ln2(x))
        return x


class Model(nn.Module):
    """Encoder-decoder with one shared token embedding for both sides."""

    def __init__(self, config):
        super().__init__()
        self.tok_emb = nn.Embedding(config[varables.SIZE_VOCAB], config[varables.DIM_EMBEDDING])
        # Learned positional embedding shared by encoder and decoder.
        self.pos_emb = nn.Parameter(
            torch.zeros(1, config[varables.SIZE_BLOCK], config[varables.DIM_EMBEDDING]))
        self.drop = nn.Dropout(config[varables.RATE_DROPOUT])
        self.encoder_blocks = nn.ModuleList(
            [EncoderBlock(config) for _ in range(config[varables.NUM_LAYERS])])
        self.decoder_blocks = nn.ModuleList(
            [DecoderBlock(config) for _ in range(config[varables.NUM_LAYERS])])
        # NOTE(review): ln_f is shared between encoder and decoder outputs.
        self.ln_f = nn.LayerNorm(config[varables.DIM_EMBEDDING])
        self.head = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.SIZE_VOCAB], bias=False)
        self.block_size = config[varables.SIZE_BLOCK]
        self.apply(self._init_weights)
        logger.info("number of parameters: %e", sum(p.numel() for p in self.parameters()))
        self.optimizer = None

    def get_block_size(self):
        """Maximum sequence length supported by pos_emb / masks."""
        return self.block_size

    def _init_weights(self, module):
        """Normal(0, 0.02) init for Linear/Embedding; unit LayerNorm."""
        if isinstance(module, (nn.Linear, nn.Embedding)):
            module.weight.data.normal_(mean=0.0, std=0.02)
            if isinstance(module, nn.Linear) and module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

    def init_optimizers(self, train_config):
        """Adam optimizer from the training config (returned, not stored)."""
        return torch.optim.Adam(self.parameters(), lr=train_config[varables.RATE_LEARNING])

    def init_scheduler(self, train_config):
        """StepLR over ``self.optimizer`` (caller must have assigned it)."""
        return torch.optim.lr_scheduler.StepLR(
            self.optimizer,
            step_size=train_config[varables.SIZE_STEP],
            gamma=train_config[varables.GAMMA])

    def get_collate_fn(self, vocab):
        """collate_fn right-padding both sequences with the single vocab's PAD."""
        def collate(results):
            x_in = [a[0] for a in results]
            y_in = [a[1] for a in results]
            boundary = -1  # placeholder kept for interface parity
            max_len_x = max(len(a) for a in x_in)
            max_len_y = max(len(a) for a in y_in)
            x = torch.tensor(
                [a + [vocab[varables.TOKEN_PAD]] * (max_len_x - len(a)) for a in x_in],
                dtype=torch.long)
            y = torch.tensor(
                [a + [vocab[varables.TOKEN_PAD]] * (max_len_y - len(a)) for a in y_in],
                dtype=torch.long)
            return x, y, boundary
        return collate

    def forward(self, x_in, y_in, y_out=None, boundary=None):
        """Return (logits, loss); loss is None when ``y_out`` is None.

        NOTE(review): no padding mask is applied anywhere and the loss has
        no ignore_index — padded positions attend and contribute to the
        loss.  The newer model.py fixes both.
        """
        x_in = self.drop(self.tok_emb(x_in) + self.pos_emb[:, :x_in.size()[1], :])
        y_in = self.drop(self.tok_emb(y_in) + self.pos_emb[:, :y_in.size()[1], :])
        for encoder_block in self.encoder_blocks:
            x_in = encoder_block(x_in)
        x_in = self.ln_f(x_in)
        for decoder_block in self.decoder_blocks:
            y_in = decoder_block(x_in, y_in)
        y_in = self.ln_f(y_in)
        logits = self.head(y_in)
        loss = None
        if y_out is not None:
            loss = F.cross_entropy(logits.view(-1, logits.size(-1)), y_out.view(-1))
        return logits, loss
"""Current Transformer_debug6/model.py building blocks: sinusoidal positional
encoder, unified (self/cross) attention, FFN, and post-norm encoder/decoder
blocks.  The trailing ``Model`` class continues beyond this chunk and is not
reproduced here.  Behavior preserved; review notes marked NOTE(review).
"""
import logging
import math

import torch
import torch.nn as nn
from torch.autograd import Variable
from torch.nn import functional as F

from SCMG.config import varables


class PositionalEncoder(nn.Module):
    """Vectorised sinusoidal positional encoding, precomputed to SIZE_BLOCK.

    ``forward(T)`` returns the (dropout-applied) first T position vectors,
    shape (1, T, DIM_EMBEDDING); callers add it to their embeddings.
    """

    def __init__(self, config):
        super(PositionalEncoder, self).__init__()
        self.Dropout = nn.Dropout(p=config[varables.RATE_DROPOUT])
        max_len = config[varables.SIZE_BLOCK]
        dim = config[varables.DIM_EMBEDDING]
        table = torch.zeros(max_len, dim)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, dim, 2).float() * (-math.log(10000.0) / dim))
        table[:, 0::2] = torch.sin(position * div_term)
        table[:, 1::2] = torch.cos(position * div_term)
        self.register_buffer('pe', table.unsqueeze(0))

    def forward(self, T):
        # NOTE(review): dropout is applied to the positional table itself
        # (before addition to embeddings) — confirm that is intended.
        return self.Dropout(self.pe[:, :T, :])


class Attention(nn.Module):
    """Multi-head attention serving both self- and cross-attention.

    ``forward(X_1, X_2, mask)``: keys/values come from X_1, queries from
    X_2; passing ``X_2=None`` makes it self-attention over X_1.  ``mask``
    must broadcast to (B, heads, T_query, T_key); zero entries are filled
    with -1e9 before softmax.
    """

    def __init__(self, config):
        super().__init__()
        assert config[varables.DIM_ATTENTION] % config[varables.NUM_HEADS] == 0
        dim_in = config[varables.DIM_EMBEDDING]
        dim_att = config[varables.DIM_ATTENTION]
        self.Key = nn.Linear(dim_in, dim_att)
        self.Query = nn.Linear(dim_in, dim_att)
        self.Value = nn.Linear(dim_in, dim_att)
        self.Dropout_Attention = nn.Dropout(config[varables.RATE_DROPOUT])
        self.Dropout_Residue = nn.Dropout(config[varables.RATE_DROPOUT])
        self.Projection = nn.Linear(dim_att, dim_in)
        self.NumberOfHeads = config[varables.NUM_HEADS]
        self.DimHead = dim_att // self.NumberOfHeads
        self.DimAttention = dim_att

    def forward(self, X_1, X_2, mask=None):
        if X_2 is None:
            X_2 = X_1  # self-attention
        BatchSize, T_kv, _ = X_1.size()
        _, T_q, _ = X_2.size()
        K = self.Key(X_1).view(BatchSize, T_kv, self.NumberOfHeads, self.DimHead).transpose(1, 2)
        Q = self.Query(X_2).view(BatchSize, T_q, self.NumberOfHeads, self.DimHead).transpose(1, 2)
        V = self.Value(X_1).view(BatchSize, T_kv, self.NumberOfHeads, self.DimHead).transpose(1, 2)
        scores = (Q @ K.transpose(-2, -1)) / math.sqrt(self.DimHead)
        scores = scores.masked_fill(mask == 0, -1e9)
        scores = self.Dropout_Attention(F.softmax(scores, dim=-1))
        Z = (scores @ V).transpose(1, 2).contiguous().view(BatchSize, T_q, self.DimAttention)
        return self.Dropout_Residue(self.Projection(Z))


class FeedForward(nn.Module):
    """Position-wise FFN; DIM_FEEDFORWARD==0 defaults to 4*DIM_ATTENTION.

    NOTE(review): dropout sits between GELU and the second linear, and
    there is no dropout on the output — kept as in the original.
    """

    def __init__(self, config):
        super().__init__()
        if config[varables.DIM_FEEDFORWARD] == 0:
            hidden = config[varables.DIM_ATTENTION] * 4
        else:
            hidden = config[varables.DIM_FEEDFORWARD]
        self.Linear1 = nn.Linear(config[varables.DIM_EMBEDDING], hidden)
        self.GELU = nn.GELU()
        self.Linear2 = nn.Linear(hidden, config[varables.DIM_EMBEDDING])
        self.Dropout = nn.Dropout(config[varables.RATE_DROPOUT])

    def forward(self, x):
        return self.Linear2(self.Dropout(self.GELU(self.Linear1(x))))


class EncoderBlock(nn.Module):
    """Post-norm encoder block: LayerNorm(x + sublayer(dropout(x))).

    NOTE(review): dropout is applied to the sublayer INPUT while the
    residual uses the undropped x — unusual ordering, kept as-is.
    """

    def __init__(self, config):
        super().__init__()
        dim = config[varables.DIM_EMBEDDING]
        rate = config[varables.RATE_DROPOUT]
        self.LayerNorm1 = nn.LayerNorm(dim)
        self.LayerNorm2 = nn.LayerNorm(dim)
        self.Dropout1 = nn.Dropout(rate)
        self.Dropout2 = nn.Dropout(rate)
        self.Attention = Attention(config)
        self.FeedForward = FeedForward(config)

    def forward(self, X_Encoder, Mask_Encoder):
        X_Encoder = self.LayerNorm1(
            X_Encoder + self.Attention(self.Dropout1(X_Encoder), None, Mask_Encoder))
        X_Encoder = self.LayerNorm2(
            X_Encoder + self.FeedForward(self.Dropout2(X_Encoder)))
        return X_Encoder


class DecoderBlock(nn.Module):
    """Post-norm decoder block: masked self-attention, cross-attention, FFN."""

    def __init__(self, config):
        super().__init__()
        dim = config[varables.DIM_EMBEDDING]
        rate = config[varables.RATE_DROPOUT]
        self.LayerNorm1 = nn.LayerNorm(dim)
        self.LayerNorm2 = nn.LayerNorm(dim)
        self.LayerNorm3 = nn.LayerNorm(dim)
        self.Dropout1 = nn.Dropout(rate)
        self.Dropout2 = nn.Dropout(rate)
        self.Dropout3 = nn.Dropout(rate)
        self.AttentionMasked = Attention(config)
        self.AttentionCross = Attention(config)
        self.FeedForward = FeedForward(config)

    def forward(self, X_Encoder, X_Decoder, Mask_Cross, Mask_Decoder):
        X_Decoder = self.LayerNorm1(
            X_Decoder + self.AttentionMasked(self.Dropout1(X_Decoder), None, Mask_Decoder))
        X_Decoder = self.LayerNorm2(
            X_Decoder + self.AttentionCross(X_Encoder, self.Dropout2(X_Decoder), Mask_Cross))
        X_Decoder = self.LayerNorm3(
            X_Decoder + self.FeedForward(self.Dropout3(X_Decoder)))
        return X_Decoder
elif isinstance(module, nn.LayerNorm): + # module.bias.data.zero_() + # module.weight.data.fill_(1.0) + def init_optimizers(self,train_config): + optimizer = torch.optim.Adam(self.parameters(), lr=train_config[varables.RATE_LEARNING]) + return optimizer + def init_scheduler(self,train_config): + scheduler = torch.optim.lr_scheduler.StepLR(self.optimizer, step_size=train_config[varables.SIZE_STEP], gamma=train_config[varables.GAMMA]) + return scheduler + def get_collate_fn(self, vocab_encoder,vocab_decoder): + def collate(results): + X_Encoder = [a[0] for a in results] + X_Decoder = [a[1] for a in results] + boundary = -1 + max_len_x = max([len(a) for a in X_Encoder]) + max_len_y = max([len(a) for a in X_Decoder]) + x = torch.tensor([(a+[vocab_encoder[varables.TOKEN_PAD]]*(max_len_x-len(a))) for a in X_Encoder],dtype=torch.long) + y = torch.tensor([(a+[vocab_decoder[varables.TOKEN_PAD]]*(max_len_y-len(a))) for a in X_Decoder],dtype=torch.long) + return x,y,boundary + return collate + + + def generate_masks(self,X_Encoder, X_Decoder): + # Generate encoder, decoder, cross masks + T = X_Decoder.shape[1] + Mask_Encoder = (X_Encoder != self.Token_Padding_Encoder).unsqueeze(-2).unsqueeze(-2) + Mask_Decoder = (X_Decoder != self.Token_Padding_Decoder).unsqueeze(-2).unsqueeze(-2).repeat(1,1,T,1) + Mask_Cross = (X_Encoder != self.Token_Padding_Encoder).unsqueeze(-2).unsqueeze(-2) + mask_tril = torch.tril(torch.ones(T, T)).view(1, 1, T, T).to(Mask_Decoder.device) + Mask_Decoder = Mask_Decoder.masked_fill(mask_tril==0,0) + return Mask_Encoder,Mask_Decoder,Mask_Cross + + def forward(self, X_Encoder, X_Decoder, Y_Decoder_Ref=None,boundary=None): + Mask_Encoder, Mask_Decoder,Mask_Cross = self.generate_masks(X_Encoder, X_Decoder) + # preprocess + X_Encoder = self.Dropout1(self.Embedding_Encoder(X_Encoder) * math.sqrt(self.Dim_Embedding) + self.pos_emb(X_Encoder.size(1))) + X_Decoder = self.Dropout2(self.Embedding_Decoder(X_Decoder) * math.sqrt(self.Dim_Embedding) + 
self.pos_emb(X_Decoder.size(1))) + #### Now X_Encoder: BatchSize, SequenceLength, DimAttention + # Encoder blocks + for encoder_block in self.encoder_blocks: + X_Encoder = encoder_block(X_Encoder,Mask_Encoder) + # X_Encoder = self.LayerNorm1(X_Encoder) + # Decoder blocks + for decoder_block in self.decoder_blocks: + X_Decoder = decoder_block(X_Encoder,X_Decoder,Mask_Cross,Mask_Decoder) + # X_Decoder = self.LayerNorm2(X_Decoder) + Y_Decoder_Logits = self.head(X_Decoder) + loss = None + if Y_Decoder_Ref is not None: + # loss = F.cross_entropy(Y_Decoder_Logits.view(-1, Y_Decoder_Logits.size(-1)), Y_Decoder_Ref.view(-1),ignore_index=self.Token_Padding_Decoder) + loss1 = F.nll_loss(F.log_softmax(Y_Decoder_Logits,dim=-1).view(-1, Y_Decoder_Logits.size(-1)),Y_Decoder_Ref.view(-1),ignore_index=self.Token_Padding_Decoder) + loss2 = F.kl_div(F.log_softmax(Y_Decoder_Logits,dim=-1),F.one_hot(Y_Decoder_Ref,num_classes=Y_Decoder_Logits.shape[-1]).type_as(Y_Decoder_Logits)) + return Y_Decoder_Logits, loss1+loss2 + + + # def generate_masks(self,X_Encoder, X_Decoder): + # # Generate encoder, decoder, cross masks + # Mask_Encoder = (X_Encoder != self.Token_Padding_Encoder).unsqueeze(-2).int().cpu() + # Mask_Decoder = (X_Decoder != self.Token_Padding_Decoder).unsqueeze(-2).int().cpu() + # Mask_Cross = Mask_Decoder.unsqueeze(-1) @ Mask_Encoder.unsqueeze(-2) + # Mask_Encoder = Mask_Encoder.unsqueeze(-1) @ Mask_Encoder.unsqueeze(-2) + # Mask_Decoder = Mask_Decoder.unsqueeze(-1) @ Mask_Decoder.unsqueeze(-2) + # T = X_Decoder.shape[1] + # mask_tril = torch.tril(torch.ones(T, T)).view(1, 1, T, T) + # Mask_Decoder = Mask_Decoder.masked_fill(mask_tril==0,0) + # Mask_Encoder = Mask_Encoder.to(X_Encoder.device) + # Mask_Decoder = Mask_Decoder.to(X_Decoder.device) + # Mask_Cross = Mask_Cross.to(X_Encoder.device) + # return Mask_Encoder,Mask_Decoder,Mask_Cross diff --git a/SCMG/models/Transformer_debug6/sampler.py b/SCMG/models/Transformer_debug6/sampler.py new file mode 100644 index 
0000000000000000000000000000000000000000..1c606302447534d425b50bbd15c153ea79895b65 --- /dev/null +++ b/SCMG/models/Transformer_debug6/sampler.py @@ -0,0 +1,85 @@ +import random +import numpy as np +import torch +import torch.nn as nn +from torch.nn import functional as F + +def set_seed(seed): + random.seed(seed) + np.random.seed(seed) + torch.manual_seed(seed) + torch.cuda.manual_seed_all(seed) + +def top_k_logits(logits, k): + v, ix = torch.topk(logits, k) + out = logits.clone() + out[out < v[:, [-1]]] = -float('Inf') + return out + +@torch.no_grad() +def sample(model, x, steps, temperature=1.0, sample=False, top_k=None): + block_size = model.get_block_size() + model.eval() + for k in range(steps): + x_cond = x if x.size(1) <= block_size else x[:, -block_size:] + logits, _ = model(x_cond) + logits = logits[:, -1, :] / temperature + if top_k is not None: + logits = top_k_logits(logits, top_k) + probs = F.softmax(logits, dim=-1) + if sample: + ix = torch.multinomial(probs, num_samples=1) + else: + _, ix = torch.topk(probs, k=1, dim=-1) + x = torch.cat((x, ix), dim=1) + + return x + + + + +@torch.no_grad() +def sample(model, x, steps, temperature=1.0,boundary=None): + block_size = model.get_block_size() + model.eval() + for k in range(steps): + x_cond = x if x.size(1) <= block_size else x[:, -block_size:] + logits, _ = model(x_cond,boundary=boundary) + logits = logits[:, -1, :] / temperature + probs = F.softmax(logits, dim=-1) + ix = torch.multinomial(probs, num_samples=1) + x = torch.cat((x, ix), dim=1) + return x + +'L_5*C(=O)NCc1cccc(OC)c1.*c1nsc2ccccc12COc1cccc(CNC(=O)c2cccc(NC(=O)c3nsc4ccccc34)c2)c1' + +# for i in range(1,21): +def sample_L(i,option='string'): + # i=2 + prefix = 'L_'+str(i) + string_input = prefix + '*O=C1NN=Cc2c1cccc2.*O=C(C1CC1)N1CCNCC1' + array_input = [vocab[a] for a in [''] + list(string_input)] + boundary = [len(array_input)] + tensor_input = torch.tensor(array_input,device='cuda').unsqueeze(0).repeat(32,1) + boundary = boundary*32 + 
tensor_output = sample(model,tensor_input,250,boundary=boundary) + strings_output = [] + for j in range(tensor_output.shape[0]): + list_string_output = [inv[a] for a in tensor_output[j,boundary[j]:].cpu().numpy() if a != vocab['']] + # if list_string_output[0] == '': + # list_string_output = list_string_output[1:] + if list_string_output[-1] == '': + list_string_output = list_string_output[:-1] + string_output = ''.join(list_string_output) + strings_output.append(string_output) + print(string_output) + for j in range(tensor_output.shape[0]): + if test_valid(strings_output[j]): + print(1) + else: + print(0) + + # logits,_ = model(tensor_input,boundary=boundary) + + +['', 'L', '_', '5', '*', 'C', '(', '=', 'O', ')', 'N', 'C', 'c', '1', 'c', 'c', 'c', 'c', '(', 'O', 'C', ')', 'c', '1', '.', '*', 'c', '1', 'n', 's', 'c', '2', 'c', 'c', 'c', 'c', 'c', '1', '2', 'C', 'O', 'c', '1', 'c', 'c', 'c', 'c', '(', 'C', 'N', 'C', '(', '=', 'O', ')', 'c', '2', 'c', 'c', 'c', 'c', '(', 'N', 'C', '(', '=', 'O', ')', 'c', '3', 'n', 's', 'c', '4', 'c', 'c', 'c', 'c', 'c', '3', '4', ')', 'c', '2', ')', 'c', '1', ''] diff --git a/SCMG/models/Transformer_debug7/__init__.py b/SCMG/models/Transformer_debug7/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/SCMG/models/Transformer_debug7/__pycache__/__init__.cpython-310.pyc b/SCMG/models/Transformer_debug7/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..29099ec7592db2013e76a7bc4927092d1f6b863b Binary files /dev/null and b/SCMG/models/Transformer_debug7/__pycache__/__init__.cpython-310.pyc differ diff --git a/SCMG/models/Transformer_debug7/__pycache__/model.cpython-310.pyc b/SCMG/models/Transformer_debug7/__pycache__/model.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ecc0a192b0932cb2f168d0801551de57acbf6bfd Binary files /dev/null and 
b/SCMG/models/Transformer_debug7/__pycache__/model.cpython-310.pyc differ diff --git a/SCMG/models/Transformer_debug7/model.py b/SCMG/models/Transformer_debug7/model.py new file mode 100644 index 0000000000000000000000000000000000000000..7728093143a64c981c21da51a2ab03af262e6df8 --- /dev/null +++ b/SCMG/models/Transformer_debug7/model.py @@ -0,0 +1,233 @@ +import math +import logging + +import torch +import torch.nn as nn +from torch.nn import functional as F + +# logger = logging.getLogger(__name__) +from SCMG.config import varables +from torch.autograd import Variable + +class PositionalEncoder(nn.Module): + def __init__(self, config): + super(PositionalEncoder, self).__init__() + self.Dropout = nn.Dropout(p=config[varables.RATE_DROPOUT]) + max_len = config[varables.SIZE_BLOCK] + pe = torch.zeros(max_len, config[varables.DIM_EMBEDDING]) + position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1) + div_term = torch.exp(torch.arange(0, config[varables.DIM_EMBEDDING], 2).float() * (-math.log(10000.0) / config[varables.DIM_EMBEDDING])) + pe[:, 0::2] = torch.sin(position * div_term) + pe[:, 1::2] = torch.cos(position * div_term) + pe = pe.unsqueeze(0) + self.register_buffer('pe', pe) + def forward(self, T): + x = self.Dropout(self.pe[:,:T, :]) + return x + + +class Attention(nn.Module): + def __init__(self, config): + super().__init__() + assert config[varables.DIM_ATTENTION] % config[varables.NUM_HEADS] == 0 + self.Key = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION]) + self.Query = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION]) + self.Value = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION]) + self.Dropout_Attention = nn.Dropout(config[varables.RATE_DROPOUT]) + self.Dropout_Residue = nn.Dropout(config[varables.RATE_DROPOUT]) + self.Projection = nn.Linear(config[varables.DIM_ATTENTION], config[varables.DIM_EMBEDDING]) + self.NumberOfHeads = config[varables.NUM_HEADS] + self.DimHead 
= config[varables.DIM_ATTENTION] // self.NumberOfHeads + self.DimAttention = config[varables.DIM_ATTENTION] + + def forward(self, X_1,X_2, mask=None): + if X_2 is None: + X_2 = X_1 + BatchSize, T_Encoder, _ = X_1.size() + BatchSize, T_Decoder, _ = X_2.size() + K = self.Key( X_1).view(BatchSize, T_Encoder, self.NumberOfHeads,self.DimHead).transpose(1, 2) + Q = self.Query(X_2).view(BatchSize, T_Decoder, self.NumberOfHeads,self.DimHead).transpose(1, 2) + V = self.Value(X_1).view(BatchSize, T_Encoder, self.NumberOfHeads,self.DimHead).transpose(1, 2) + # k,q,v dimension: (BatchSize, SequenceSize, NumberOfHeads, HeadDimension) 3,4,5,16 + ScoreAttention = (Q @ K.transpose(-2, -1)) / math.sqrt(self.DimHead) + ScoreAttention = ScoreAttention.masked_fill(mask==0, -1e9) + ScoreAttention = F.softmax(ScoreAttention, dim=-1) + ScoreAttention = self.Dropout_Attention(ScoreAttention) + # k.transpose(-2,-1): 3,4,16,5 + # (q@(k.transpose(-2,-1))): 3,4,5,5 + Z = ScoreAttention @ V + # y dimension: 3,4,5,16 + Z = Z.transpose(1, 2).contiguous().view(BatchSize, T_Decoder, self.DimAttention) + # y dimension: 3,5,64 + Z = self.Dropout_Residue(self.Projection(Z)) + return Z + + + + + + + + + + +class FeedForward(nn.Module): + def __init__(self, config): + super().__init__() + if config[varables.DIM_FEEDFORWARD] == 0: + Dim_FeedForward = config[varables.DIM_ATTENTION] *4 + else: + Dim_FeedForward = config[varables.DIM_FEEDFORWARD] + self.Linear1 = nn.Linear(config[varables.DIM_EMBEDDING], Dim_FeedForward) + self.GELU = nn.GELU() + self.Linear2 = nn.Linear(Dim_FeedForward, config[varables.DIM_EMBEDDING]) + self.Dropout = nn.Dropout(config[varables.RATE_DROPOUT]) + + def forward(self,x): + x = self.Linear1(x) + x = self.GELU (x) + x = self.Dropout(x) + x = self.Linear2(x) + return x + + + + +class EncoderBlock(nn.Module): + def __init__(self, config): + super().__init__() + self.LayerNorm1 = nn.LayerNorm(config[varables.DIM_EMBEDDING]) + self.LayerNorm2 = 
nn.LayerNorm(config[varables.DIM_EMBEDDING]) + self.Dropout1 = nn.Dropout(config[varables.RATE_DROPOUT]) + self.Dropout2 = nn.Dropout(config[varables.RATE_DROPOUT]) + self.Attention = Attention( config) + self.FeedForward = FeedForward(config) + + def forward(self, X_Encoder,Mask_Encoder): + X_Encoder = self.LayerNorm1(X_Encoder + self.Attention (self.Dropout1(X_Encoder), None, Mask_Encoder)) + X_Encoder = self.LayerNorm2(X_Encoder + self.FeedForward(self.Dropout2(X_Encoder))) + return X_Encoder + +class DecoderBlock(nn.Module): + def __init__(self, config): + super().__init__() + self.LayerNorm1 = nn.LayerNorm(config[varables.DIM_EMBEDDING]) + self.LayerNorm2 = nn.LayerNorm(config[varables.DIM_EMBEDDING]) + self.LayerNorm3 = nn.LayerNorm(config[varables.DIM_EMBEDDING]) + self.Dropout1 = nn.Dropout(config[varables.RATE_DROPOUT]) + self.Dropout2 = nn.Dropout(config[varables.RATE_DROPOUT]) + self.Dropout3 = nn.Dropout(config[varables.RATE_DROPOUT]) + self.AttentionMasked = Attention( config) + self.AttentionCross = Attention( config) + self.FeedForward = FeedForward(config) + + def forward(self, X_Encoder,X_Decoder,Mask_Cross,Mask_Decoder): + X_Decoder = self.LayerNorm1(X_Decoder + self.AttentionMasked(self.Dropout1(X_Decoder), None, Mask_Decoder)) + X_Decoder = self.LayerNorm2(X_Decoder + self.AttentionCross ( X_Encoder, self.Dropout2(X_Decoder), Mask_Cross )) + X_Decoder = self.LayerNorm3(X_Decoder + self.FeedForward (self.Dropout3(X_Decoder) )) + return X_Decoder + + + + + + + + + + + + + + + + + + +class Model(nn.Module): + def __init__(self, config): + super().__init__() + # Varables + self.Dim_Embedding = config[varables.DIM_EMBEDDING] + self.Token_Padding_Encoder = config["Token_Padding_Encoder"] + self.Token_Padding_Decoder = config["Token_Padding_Decoder"] + # Embedding and positional encoding layers + self.Embedding_Encoder = nn.Embedding(len(config["vocab_encoder"]), config[varables.DIM_EMBEDDING]) + self.Embedding_Decoder = 
nn.Embedding(len(config["vocab_decoder"]), config[varables.DIM_EMBEDDING]) + self.pos_emb = PositionalEncoder(config) + # Dropout and normalization layers + self.Dropout1 = nn.Dropout(config[varables.RATE_DROPOUT]) + self.Dropout2 = nn.Dropout(config[varables.RATE_DROPOUT]) + self.LayerNorm1 = nn.LayerNorm(config[varables.DIM_EMBEDDING]) + self.LayerNorm2 = nn.LayerNorm(config[varables.DIM_EMBEDDING]) + # Transformer layers + self.encoder_blocks = nn.ModuleList([EncoderBlock(config) for _ in range(config[varables.NUM_LAYERS])]) + self.decoder_blocks = nn.ModuleList([DecoderBlock(config) for _ in range(config[varables.NUM_LAYERS])]) + # Output layer + self.head = nn.Linear(config[varables.DIM_EMBEDDING], len(config["vocab_decoder"]), bias=False) + # Init + self.apply(self._init_weights) + self.optimizer = None + # logger.info("number of parameters: %e", sum(p.numel() for p in self.parameters())) + + def _init_weights(self, module): + for p in module.parameters(): + if p.dim() > 1: + nn.init.xavier_uniform_(p) + # if isinstance(module, (nn.Linear, nn.Embedding)): + # module.weight.data.normal_(mean=0.0, std=0.02) + # if isinstance(module, nn.Linear) and module.bias is not None: + # module.bias.data.zero_() + # elif isinstance(module, nn.LayerNorm): + # module.bias.data.zero_() + # module.weight.data.fill_(1.0) + def init_optimizers(self,train_config): + optimizer = torch.optim.Adam(self.parameters(), lr=train_config[varables.RATE_LEARNING]) + return optimizer + + def init_scheduler(self,train_config): + scheduler = torch.optim.lr_scheduler.StepLR(self.optimizer, step_size=train_config[varables.SIZE_STEP], gamma=train_config[varables.GAMMA]) + return scheduler + def get_collate_fn(self, vocab_encoder,vocab_decoder): + def collate(results): + X_Encoder = [a[0] for a in results] + X_Decoder = [a[1] for a in results] + boundary = -1 + max_len_x = max([len(a) for a in X_Encoder]) + max_len_y = max([len(a) for a in X_Decoder]) + x = 
torch.tensor([(a+[vocab_encoder[varables.TOKEN_PAD]]*(max_len_x-len(a))) for a in X_Encoder],dtype=torch.long) + y = torch.tensor([(a+[vocab_decoder[varables.TOKEN_PAD]]*(max_len_y-len(a))) for a in X_Decoder],dtype=torch.long) + return x,y,boundary + return collate + + def generate_masks(self,X_Encoder, X_Decoder): + # Generate encoder, decoder, cross masks + T = X_Decoder.shape[1] + Mask_Encoder = (X_Encoder != self.Token_Padding_Encoder).unsqueeze(-2).unsqueeze(-2) + Mask_Decoder = (X_Decoder != self.Token_Padding_Decoder).unsqueeze(-2).unsqueeze(-2).repeat(1,1,T,1) + Mask_Cross = (X_Encoder != self.Token_Padding_Encoder).unsqueeze(-2).unsqueeze(-2) + mask_tril = torch.tril(torch.ones(T, T)).view(1, 1, T, T).to(Mask_Decoder.device) + Mask_Decoder = Mask_Decoder.masked_fill(mask_tril==0,0) + return Mask_Encoder,Mask_Decoder,Mask_Cross + + def forward(self, X_Encoder, X_Decoder, Y_Decoder_Ref=None,boundary=None): + Mask_Encoder, Mask_Decoder,Mask_Cross = self.generate_masks(X_Encoder, X_Decoder) + # preprocess + X_Encoder = self.Dropout1(self.Embedding_Encoder(X_Encoder) * math.sqrt(self.Dim_Embedding) + self.pos_emb(X_Encoder.size(1))) + X_Decoder = self.Dropout2(self.Embedding_Decoder(X_Decoder) * math.sqrt(self.Dim_Embedding) + self.pos_emb(X_Decoder.size(1))) + #### Now X_Encoder: BatchSize, SequenceLength, DimAttention + # Encoder blocks + for encoder_block in self.encoder_blocks: + X_Encoder = encoder_block(X_Encoder,Mask_Encoder) + # X_Encoder = self.LayerNorm1(X_Encoder) + # Decoder blocks + for decoder_block in self.decoder_blocks: + X_Decoder = decoder_block(X_Encoder,X_Decoder,Mask_Cross,Mask_Decoder) + # X_Decoder = self.LayerNorm2(X_Decoder) + Y_Decoder_Logits = self.head(X_Decoder) + loss = None + if Y_Decoder_Ref is not None: + # loss = F.cross_entropy(Y_Decoder_Logits.view(-1, Y_Decoder_Logits.size(-1)), Y_Decoder_Ref.view(-1),ignore_index=self.Token_Padding_Decoder) + loss1 = F.nll_loss(F.log_softmax(Y_Decoder_Logits,dim=-1).view(-1, 
Y_Decoder_Logits.size(-1)),Y_Decoder_Ref.view(-1),ignore_index=self.Token_Padding_Decoder) + loss2 = F.kl_div(F.log_softmax(Y_Decoder_Logits,dim=-1),F.one_hot(Y_Decoder_Ref,num_classes=Y_Decoder_Logits.shape[-1]).type_as(Y_Decoder_Logits)) + return Y_Decoder_Logits, loss1+loss2 \ No newline at end of file diff --git a/SCMG/models/Transformer_debug8/__init__.py b/SCMG/models/Transformer_debug8/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/SCMG/models/Transformer_debug8/__pycache__/__init__.cpython-310.pyc b/SCMG/models/Transformer_debug8/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6d605ab90fb3d410739fee3b7401a585edb7f44b Binary files /dev/null and b/SCMG/models/Transformer_debug8/__pycache__/__init__.cpython-310.pyc differ diff --git a/SCMG/models/Transformer_debug8/__pycache__/model.cpython-310.pyc b/SCMG/models/Transformer_debug8/__pycache__/model.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f35af746c45f932d67cd854ebcb8fcc8704c4c2b Binary files /dev/null and b/SCMG/models/Transformer_debug8/__pycache__/model.cpython-310.pyc differ diff --git a/SCMG/models/Transformer_debug8/model.py b/SCMG/models/Transformer_debug8/model.py new file mode 100644 index 0000000000000000000000000000000000000000..fc61da6e3d71be959658010e6639179b3c8f2425 --- /dev/null +++ b/SCMG/models/Transformer_debug8/model.py @@ -0,0 +1,245 @@ +import math +import logging + +import torch +import torch.nn as nn +from torch.nn import functional as F + +# logger = logging.getLogger(__name__) +from SCMG.config import varables +from torch.autograd import Variable + +class PositionalEncoder(nn.Module): + def __init__(self, config): + super(PositionalEncoder, self).__init__() + self.Dropout = nn.Dropout(p=config[varables.RATE_DROPOUT]) + max_len = config[varables.SIZE_BLOCK] + pe = torch.zeros(max_len, 
config[varables.DIM_EMBEDDING]) + position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1) + div_term = torch.exp(torch.arange(0, config[varables.DIM_EMBEDDING], 2).float() * (-math.log(10000.0) / config[varables.DIM_EMBEDDING])) + pe[:, 0::2] = torch.sin(position * div_term) + pe[:, 1::2] = torch.cos(position * div_term) + pe = pe.unsqueeze(0) + self.register_buffer('pe', pe) + def forward(self, T): + x = self.Dropout(self.pe[:,:T, :]) + return x + + +class Attention(nn.Module): + def __init__(self, config): + super().__init__() + assert config[varables.DIM_ATTENTION] % config[varables.NUM_HEADS] == 0 + self.Key = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION]) + self.Query = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION]) + self.Value = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION]) + self.Dropout_Attention = nn.Dropout(config[varables.RATE_DROPOUT]) + self.Dropout_Residue = nn.Dropout(config[varables.RATE_DROPOUT]) + self.Projection = nn.Linear(config[varables.DIM_ATTENTION], config[varables.DIM_EMBEDDING]) + self.NumberOfHeads = config[varables.NUM_HEADS] + self.DimHead = config[varables.DIM_ATTENTION] // self.NumberOfHeads + self.DimAttention = config[varables.DIM_ATTENTION] + + def forward(self, X_1,X_2, mask=None): + if X_2 is None: + X_2 = X_1 + BatchSize, T_Encoder, _ = X_1.size() + BatchSize, T_Decoder, _ = X_2.size() + K = self.Key( X_1).view(BatchSize, T_Encoder, self.NumberOfHeads,self.DimHead).transpose(1, 2) + Q = self.Query(X_2).view(BatchSize, T_Decoder, self.NumberOfHeads,self.DimHead).transpose(1, 2) + V = self.Value(X_1).view(BatchSize, T_Encoder, self.NumberOfHeads,self.DimHead).transpose(1, 2) + # k,q,v dimension: (BatchSize, SequenceSize, NumberOfHeads, HeadDimension) 3,4,5,16 + ScoreAttention = (Q @ K.transpose(-2, -1)) / math.sqrt(self.DimHead) + ScoreAttention = ScoreAttention.masked_fill(mask==0, -1e9) + ScoreAttention = F.softmax(ScoreAttention, 
dim=-1) + ScoreAttention = self.Dropout_Attention(ScoreAttention) + # k.transpose(-2,-1): 3,4,16,5 + # (q@(k.transpose(-2,-1))): 3,4,5,5 + Z = ScoreAttention @ V + # y dimension: 3,4,5,16 + Z = Z.transpose(1, 2).contiguous().view(BatchSize, T_Decoder, self.DimAttention) + # y dimension: 3,5,64 + Z = self.Dropout_Residue(self.Projection(Z)) + return Z + + + + + + + + + + +class FeedForward(nn.Module): + def __init__(self, config): + super().__init__() + if config[varables.DIM_FEEDFORWARD] == 0: + Dim_FeedForward = config[varables.DIM_ATTENTION] *4 + else: + Dim_FeedForward = config[varables.DIM_FEEDFORWARD] + self.Linear1 = nn.Linear(config[varables.DIM_EMBEDDING], Dim_FeedForward) + self.GELU = nn.GELU() + self.Linear2 = nn.Linear(Dim_FeedForward, config[varables.DIM_EMBEDDING]) + self.Dropout = nn.Dropout(config[varables.RATE_DROPOUT]) + + def forward(self,x): + x = self.Linear1(x) + x = self.GELU (x) + x = self.Dropout(x) + x = self.Linear2(x) + return x + + + + +class EncoderBlock(nn.Module): + def __init__(self, config): + super().__init__() + self.LayerNorm1 = nn.LayerNorm(config[varables.DIM_EMBEDDING]) + self.LayerNorm2 = nn.LayerNorm(config[varables.DIM_EMBEDDING]) + self.Dropout1 = nn.Dropout(config[varables.RATE_DROPOUT]) + self.Dropout2 = nn.Dropout(config[varables.RATE_DROPOUT]) + self.Attention = Attention( config) + self.FeedForward = FeedForward(config) + + def forward(self, X_Encoder,Mask_Encoder): + X_Encoder = self.LayerNorm1(X_Encoder + self.Attention (self.Dropout1(X_Encoder), None, Mask_Encoder)) + X_Encoder = self.LayerNorm2(X_Encoder + self.FeedForward(self.Dropout2(X_Encoder))) + return X_Encoder + +class DecoderBlock(nn.Module): + def __init__(self, config): + super().__init__() + self.LayerNorm1 = nn.LayerNorm(config[varables.DIM_EMBEDDING]) + self.LayerNorm2 = nn.LayerNorm(config[varables.DIM_EMBEDDING]) + self.LayerNorm3 = nn.LayerNorm(config[varables.DIM_EMBEDDING]) + self.Dropout1 = nn.Dropout(config[varables.RATE_DROPOUT]) + 
self.Dropout2 = nn.Dropout(config[varables.RATE_DROPOUT]) + self.Dropout3 = nn.Dropout(config[varables.RATE_DROPOUT]) + self.AttentionMasked = Attention( config) + self.AttentionCross = Attention( config) + self.FeedForward = FeedForward(config) + + def forward(self, X_Encoder,X_Decoder,Mask_Cross,Mask_Decoder): + X_Decoder = self.LayerNorm1(X_Decoder + self.AttentionMasked(self.Dropout1(X_Decoder), None, Mask_Decoder)) + X_Decoder = self.LayerNorm2(X_Decoder + self.AttentionCross ( X_Encoder, self.Dropout2(X_Decoder), Mask_Cross )) + X_Decoder = self.LayerNorm3(X_Decoder + self.FeedForward (self.Dropout3(X_Decoder) )) + return X_Decoder + + + + + + + + + + + + + + + + + + +class Model(nn.Module): + def __init__(self, config): + super().__init__() + # Varables + self.Dim_Embedding = config[varables.DIM_EMBEDDING] + self.Token_Padding_Encoder = config["Token_Padding_Encoder"] + self.Token_Padding_Decoder = config["Token_Padding_Decoder"] + # Embedding and positional encoding layers + self.Embedding_Encoder = nn.Embedding(len(config["vocab_encoder"]), config[varables.DIM_EMBEDDING]) + self.Embedding_Decoder = nn.Embedding(len(config["vocab_decoder"]), config[varables.DIM_EMBEDDING]) + self.pos_emb = PositionalEncoder(config) + # Dropout and normalization layers + self.Dropout1 = nn.Dropout(config[varables.RATE_DROPOUT]) + self.Dropout2 = nn.Dropout(config[varables.RATE_DROPOUT]) + self.LayerNorm1 = nn.LayerNorm(config[varables.DIM_EMBEDDING]) + self.LayerNorm2 = nn.LayerNorm(config[varables.DIM_EMBEDDING]) + # Transformer layers + self.encoder_blocks = nn.ModuleList([EncoderBlock(config) for _ in range(config[varables.NUM_LAYERS])]) + self.decoder_blocks = nn.ModuleList([DecoderBlock(config) for _ in range(config[varables.NUM_LAYERS])]) + # Output layer + self.head = nn.Linear(config[varables.DIM_EMBEDDING], len(config["vocab_decoder"]), bias=False) + # Init + self.apply(self._init_weights) + self.optimizer = None + # logger.info("number of parameters: %e", 
sum(p.numel() for p in self.parameters())) + + def _init_weights(self, module): + for p in module.parameters(): + if p.dim() > 1: + nn.init.xavier_uniform_(p) + # if isinstance(module, (nn.Linear, nn.Embedding)): + # module.weight.data.normal_(mean=0.0, std=0.02) + # if isinstance(module, nn.Linear) and module.bias is not None: + # module.bias.data.zero_() + # elif isinstance(module, nn.LayerNorm): + # module.bias.data.zero_() + # module.weight.data.fill_(1.0) + def init_optimizers(self,train_config): + optimizer = torch.optim.Adam(self.parameters(), lr=train_config[varables.RATE_LEARNING]) + return optimizer + def init_scheduler(self,train_config): + scheduler = torch.optim.lr_scheduler.StepLR(self.optimizer, step_size=train_config[varables.SIZE_STEP], gamma=train_config[varables.GAMMA]) + return scheduler + def get_collate_fn(self, vocab_encoder,vocab_decoder): + def collate(results): + X_Encoder = [a[0] for a in results] + X_Decoder = [a[1] for a in results] + boundary = [a[2] for a in results] + max_len_x = max([len(a) for a in X_Encoder]) + max_len_y = max([len(a) for a in X_Decoder]) + x = torch.tensor([(a+[vocab_encoder[varables.TOKEN_PAD] for _ in range(max_len_x-len(a))]) for a in X_Encoder],dtype=torch.long) + y = torch.tensor([(a+[vocab_decoder[varables.TOKEN_PAD] for _ in range(max_len_y-len(a))]) for a in X_Decoder],dtype=torch.long) + x = torch.tensor([(a+[vocab_encoder[varables.TOKEN_PAD] for _ in range(max_len_x-len(a))]) for a in X_Encoder],dtype=torch.long) + return x,y,boundary + return collate + def customize_fn(self,diex): + bos_token = diex[VBS.COLUMN_TASK_TYPE] + x_in = self.tokenizer(diex[VBS.COLUMN_ENCODER]) + y_in = self.tokenizer(diex[VBS.COLUMN_DECODER]) + if len(x_in)>0: + x_in = [bos_token] + x_in + [VBS.TOKEN_END] + y_in = [bos_token] + y_in + [VBS.TOKEN_END] + x_in = [self.vocab_encoder[a] if a in self.vocab_encoder.keys() else self.vocab_encoder[""] for a in x_in ] + y_in = [self.vocab_decoder[a] if a in self.vocab_decoder.keys() 
else self.vocab_decoder[""] for a in y_in ] + boundary = len(x_in)+1 + return x_in,y_in,boundary + + + def generate_masks(self,X_Encoder, X_Decoder): + # Generate encoder, decoder, cross masks + T = X_Decoder.shape[1] + Mask_Encoder = (X_Encoder != self.Token_Padding_Encoder).unsqueeze(-2).unsqueeze(-2) + Mask_Decoder = (X_Decoder != self.Token_Padding_Decoder).unsqueeze(-2).unsqueeze(-2).repeat(1,1,T,1) + Mask_Cross = (X_Encoder != self.Token_Padding_Encoder).unsqueeze(-2).unsqueeze(-2) + mask_tril = torch.tril(torch.ones(T, T)).view(1, 1, T, T).to(Mask_Decoder.device) + Mask_Decoder = Mask_Decoder.masked_fill(mask_tril==0,0) + return Mask_Encoder,Mask_Decoder,Mask_Cross + + def forward(self, X_Encoder, X_Decoder, Y_Decoder_Ref=None,boundary=None): + Mask_Encoder, Mask_Decoder,Mask_Cross = self.generate_masks(X_Encoder, X_Decoder) + # preprocess + X_Encoder = self.Dropout1(self.Embedding_Encoder(X_Encoder) * math.sqrt(self.Dim_Embedding) + self.pos_emb(X_Encoder.size(1))) + X_Decoder = self.Dropout2(self.Embedding_Decoder(X_Decoder) * math.sqrt(self.Dim_Embedding) + self.pos_emb(X_Decoder.size(1))) + #### Now X_Encoder: BatchSize, SequenceLength, DimAttention + # Encoder blocks + for encoder_block in self.encoder_blocks: + X_Encoder = encoder_block(X_Encoder,Mask_Encoder) + # X_Encoder = self.LayerNorm1(X_Encoder) + # Decoder blocks + for decoder_block in self.decoder_blocks: + X_Decoder = decoder_block(X_Encoder,X_Decoder,Mask_Cross,Mask_Decoder) + # X_Decoder = self.LayerNorm2(X_Decoder) + Y_Decoder_Logits = self.head(X_Decoder) + loss = None + if Y_Decoder_Ref is not None: + # loss = F.cross_entropy(Y_Decoder_Logits.view(-1, Y_Decoder_Logits.size(-1)), Y_Decoder_Ref.view(-1),ignore_index=self.Token_Padding_Decoder) + loss1 = F.nll_loss(F.log_softmax(Y_Decoder_Logits,dim=-1).view(-1, Y_Decoder_Logits.size(-1)),Y_Decoder_Ref.view(-1),ignore_index=self.Token_Padding_Decoder) + loss2 = 
F.kl_div(F.log_softmax(Y_Decoder_Logits,dim=-1),F.one_hot(Y_Decoder_Ref,num_classes=Y_Decoder_Logits.shape[-1]).type_as(Y_Decoder_Logits)) + return Y_Decoder_Logits, loss1+loss2 \ No newline at end of file diff --git a/SCMG/models/Transformer_debug9/__init__.py b/SCMG/models/Transformer_debug9/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/SCMG/models/Transformer_debug9/__pycache__/__init__.cpython-310.pyc b/SCMG/models/Transformer_debug9/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b9faba93282737bc6341ad1a160c21f0ef9a54a0 Binary files /dev/null and b/SCMG/models/Transformer_debug9/__pycache__/__init__.cpython-310.pyc differ diff --git a/SCMG/models/Transformer_debug9/__pycache__/model.cpython-310.pyc b/SCMG/models/Transformer_debug9/__pycache__/model.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..fb3989b462ce7c3e35e4925ddba0e7e59ce8debc Binary files /dev/null and b/SCMG/models/Transformer_debug9/__pycache__/model.cpython-310.pyc differ diff --git a/SCMG/models/Transformer_debug9/model.py b/SCMG/models/Transformer_debug9/model.py new file mode 100644 index 0000000000000000000000000000000000000000..813184b65ea5a370097a352b0db0487fe992264e --- /dev/null +++ b/SCMG/models/Transformer_debug9/model.py @@ -0,0 +1,364 @@ +import math +import logging + +import torch +import torch.nn as nn +from torch.nn import functional as F +import partialsmiles as ps +# logger = logging.getLogger(__name__) +from SCMG.config import varables as VBS +from torch.autograd import Variable +import partialsmiles as ps +from SCMG.utils.utils_rsd import * +from rdkit import Chem +from rdkit import RDLogger +RDLogger.DisableLog('rdApp.*') + +class PositionalEncoder(nn.Module): + def __init__(self, config): + super(PositionalEncoder, self).__init__() + self.Dropout = nn.Dropout(p=config[VBS.RATE_DROPOUT]) + max_len 
= config[VBS.SIZE_BLOCK] + pe = torch.zeros(max_len, config[VBS.DIM_EMBEDDING]) + position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1) + div_term = torch.exp(torch.arange(0, config[VBS.DIM_EMBEDDING], 2).float() * (-math.log(10000.0) / config[VBS.DIM_EMBEDDING])) + pe[:, 0::2] = torch.sin(position * div_term) + pe[:, 1::2] = torch.cos(position * div_term) + pe = pe.unsqueeze(0) + self.register_buffer('pe', pe) + def forward(self, T): + x = self.Dropout(self.pe[:,:T, :]) + return x + + +class Attention(nn.Module): + def __init__(self, config): + super().__init__() + assert config[VBS.DIM_ATTENTION] % config[VBS.NUM_HEADS] == 0 + self.Key = nn.Linear(config[VBS.DIM_EMBEDDING], config[VBS.DIM_ATTENTION]) + self.Query = nn.Linear(config[VBS.DIM_EMBEDDING], config[VBS.DIM_ATTENTION]) + self.Value = nn.Linear(config[VBS.DIM_EMBEDDING], config[VBS.DIM_ATTENTION]) + self.Dropout_Attention = nn.Dropout(config[VBS.RATE_DROPOUT]) + self.Dropout_Residue = nn.Dropout(config[VBS.RATE_DROPOUT]) + self.Projection = nn.Linear(config[VBS.DIM_ATTENTION], config[VBS.DIM_EMBEDDING]) + self.NumberOfHeads = config[VBS.NUM_HEADS] + self.DimHead = config[VBS.DIM_ATTENTION] // self.NumberOfHeads + self.DimAttention = config[VBS.DIM_ATTENTION] + + def forward(self, X_1,X_2, mask=None): + if X_2 is None: + X_2 = X_1 + BatchSize, T_Encoder, _ = X_1.size() + BatchSize, T_Decoder, _ = X_2.size() + K = self.Key( X_1).view(BatchSize, T_Encoder, self.NumberOfHeads,self.DimHead).transpose(1, 2) + Q = self.Query(X_2).view(BatchSize, T_Decoder, self.NumberOfHeads,self.DimHead).transpose(1, 2) + V = self.Value(X_1).view(BatchSize, T_Encoder, self.NumberOfHeads,self.DimHead).transpose(1, 2) + # k,q,v dimension: (BatchSize, SequenceSize, NumberOfHeads, HeadDimension) 3,4,5,16 + ScoreAttention = (Q @ K.transpose(-2, -1)) / math.sqrt(self.DimHead) + ScoreAttention = ScoreAttention.masked_fill(mask==0, -1e9) + ScoreAttention = F.softmax(ScoreAttention, dim=-1) + ScoreAttention = 
self.Dropout_Attention(ScoreAttention) + # k.transpose(-2,-1): 3,4,16,5 + # (q@(k.transpose(-2,-1))): 3,4,5,5 + Z = ScoreAttention @ V + # y dimension: 3,4,5,16 + Z = Z.transpose(1, 2).contiguous().view(BatchSize, T_Decoder, self.DimAttention) + # y dimension: 3,5,64 + Z = self.Dropout_Residue(self.Projection(Z)) + return Z + + + + + + + + + + +class FeedForward(nn.Module): + def __init__(self, config): + super().__init__() + if config[VBS.DIM_FEEDFORWARD] == 0: + Dim_FeedForward = config[VBS.DIM_ATTENTION] *4 + else: + Dim_FeedForward = config[VBS.DIM_FEEDFORWARD] + self.Linear1 = nn.Linear(config[VBS.DIM_EMBEDDING], Dim_FeedForward) + self.GELU = nn.GELU() + self.Linear2 = nn.Linear(Dim_FeedForward, config[VBS.DIM_EMBEDDING]) + self.Dropout = nn.Dropout(config[VBS.RATE_DROPOUT]) + + def forward(self,x): + x = self.Linear1(x) + x = self.GELU (x) + x = self.Dropout(x) + x = self.Linear2(x) + return x + + + + +class EncoderBlock(nn.Module): + def __init__(self, config): + super().__init__() + self.LayerNorm1 = nn.LayerNorm(config[VBS.DIM_EMBEDDING]) + self.LayerNorm2 = nn.LayerNorm(config[VBS.DIM_EMBEDDING]) + self.Dropout1 = nn.Dropout(config[VBS.RATE_DROPOUT]) + self.Dropout2 = nn.Dropout(config[VBS.RATE_DROPOUT]) + self.Attention = Attention( config) + self.FeedForward = FeedForward(config) + + def forward(self, X_Encoder,Mask_Encoder): + X_Encoder = self.LayerNorm1(X_Encoder + self.Attention (self.Dropout1(X_Encoder), None, Mask_Encoder)) + X_Encoder = self.LayerNorm2(X_Encoder + self.FeedForward(self.Dropout2(X_Encoder))) + return X_Encoder + +class DecoderBlock(nn.Module): + def __init__(self, config): + super().__init__() + self.LayerNorm1 = nn.LayerNorm(config[VBS.DIM_EMBEDDING]) + self.LayerNorm2 = nn.LayerNorm(config[VBS.DIM_EMBEDDING]) + self.LayerNorm3 = nn.LayerNorm(config[VBS.DIM_EMBEDDING]) + self.Dropout1 = nn.Dropout(config[VBS.RATE_DROPOUT]) + self.Dropout2 = nn.Dropout(config[VBS.RATE_DROPOUT]) + self.Dropout3 = 
nn.Dropout(config[VBS.RATE_DROPOUT]) + self.AttentionMasked = Attention( config) + self.AttentionCross = Attention( config) + self.FeedForward = FeedForward(config) + + def forward(self, X_Encoder,X_Decoder,Mask_Cross,Mask_Decoder): + X_Decoder = self.LayerNorm1(X_Decoder + self.AttentionMasked(self.Dropout1(X_Decoder), None, Mask_Decoder)) + X_Decoder = self.LayerNorm2(X_Decoder + self.AttentionCross ( X_Encoder, self.Dropout2(X_Decoder), Mask_Cross )) + X_Decoder = self.LayerNorm3(X_Decoder + self.FeedForward (self.Dropout3(X_Decoder) )) + return X_Decoder + + + + + +class Model(nn.Module): + def __init__(self, config): + super().__init__() + # VBS + self.Dim_Embedding = config[VBS.DIM_EMBEDDING] + self.Token_Padding_Encoder = config["Token_Padding_Encoder"] + self.Token_Padding_Decoder = config["Token_Padding_Decoder"] + # Embedding and positional encoding layers + self.Embedding_Encoder = nn.Embedding(len(config["vocab_encoder"]), config[VBS.DIM_EMBEDDING]) + self.Embedding_Decoder = nn.Embedding(len(config["vocab_decoder"]), config[VBS.DIM_EMBEDDING]) + self.pos_emb = PositionalEncoder(config) + # Dropout and normalization layers + self.Dropout1 = nn.Dropout(config[VBS.RATE_DROPOUT]) + self.Dropout2 = nn.Dropout(config[VBS.RATE_DROPOUT]) + self.LayerNorm1 = nn.LayerNorm(config[VBS.DIM_EMBEDDING]) + self.LayerNorm2 = nn.LayerNorm(config[VBS.DIM_EMBEDDING]) + # Transformer layers + self.encoder_blocks = nn.ModuleList([EncoderBlock(config) for _ in range(config[VBS.NUM_LAYERS])]) + self.decoder_blocks = nn.ModuleList([DecoderBlock(config) for _ in range(config[VBS.NUM_LAYERS])]) + # Output layer + self.head = nn.Linear(config[VBS.DIM_EMBEDDING], len(config["vocab_decoder"]), bias=False) + # Init + self.apply(self._init_weights) + self.optimizer = None + self.Alpha_LabelSmoothing = None + self.TokenWeight = None + # logger.info("number of parameters: %e", sum(p.numel() for p in self.parameters())) + + def _set_train_params(self,Config): + self.Alpha_LabelSmoothing 
= Config["Alpha_LabelSmoothing"] + self.TokenWeight = Config["TokenWeight"] + + def _init_weights(self, module): + for p in module.parameters(): + if p.dim() > 1: + nn.init.xavier_uniform_(p) + # if isinstance(module, (nn.Linear, nn.Embedding)): + # module.weight.data.normal_(mean=0.0, std=0.02) + # if isinstance(module, nn.Linear) and module.bias is not None: + # module.bias.data.zero_() + # elif isinstance(module, nn.LayerNorm): + # module.bias.data.zero_() + # module.weight.data.fill_(1.0) + def init_optimizers(self,train_config): + optimizer = torch.optim.Adam(self.parameters(), lr=train_config[VBS.RATE_LEARNING]) + return optimizer + def init_scheduler(self,train_config): + scheduler = torch.optim.lr_scheduler.StepLR(self.optimizer, step_size=train_config[VBS.SIZE_STEP], gamma=train_config[VBS.GAMMA]) + return scheduler + def get_collate_fn(self, vocab_encoder,vocab_decoder): + def collate(results): + X_Encoder = [a[0] for a in results] + X_Decoder = [a[1] for a in results] + Auxiliary = [a[2] for a in results] + # + max_len_x = max([len(a) for a in X_Encoder]) + max_len_y = max([len(a) for a in X_Decoder]) + # + x = torch.tensor([(a+[vocab_encoder[VBS.TOKEN_PAD] for _ in range(max_len_x-len(a))]) for a in X_Encoder],dtype=torch.long) + y = torch.tensor([(a+[vocab_decoder[VBS.TOKEN_PAD] for _ in range(max_len_y-len(a))]) for a in X_Decoder],dtype=torch.long) + if isinstance(Auxiliary[0],list): + MaxLen_Auxiliary = max([len(TruthTable) for TruthTable in Auxiliary]) + Len_Vocab = len(self.List_Vocab_Decoder) + Auxiliary = torch.tensor([TruthTable+[[0 for _ in range(Len_Vocab)] for _ in range(MaxLen_Auxiliary-len(TruthTable))] for TruthTable in Auxiliary]) + ## + # + return x,y,Auxiliary + return collate + def customize_model_fn(self,diex): + def fn(diex): + bos_token = diex[VBS.COLUMN_TASK_TYPE] + # Encoder + x_in = self.tokenizer(diex[VBS.COLUMN_ENCODER]) + if len(x_in)>0: + x_in = [bos_token] + x_in + [VBS.TOKEN_END] + x_in = [self.vocab_encoder[a] if a in 
self.vocab_encoder.keys() else self.vocab_encoder[""] for a in x_in ] + # Decoder + y_in = self.tokenizer(diex[VBS.COLUMN_DECODER]) + y_in = [bos_token] + y_in + [VBS.TOKEN_END] + # Auxiliary + ## 1. partial + ## Is Valid + TruthTable = [] + for CurrentIndex in range(1,len(y_in)): + if (y_in[CurrentIndex] == "|" or "<" in y_in[CurrentIndex]) and y_in[CurrentIndex] != VBS.TOKEN_END: + TruthTable.append([0 for _ in range(len(self.List_Vocab_Decoder))]) + continue + CurrentTruthTable = [] + for CurrentToken in self.List_Vocab_Decoder: + try: + _ = ps.ParseSmiles("".join(y_in[1:CurrentIndex])+CurrentToken, partial=True) + IsValid = 1 + except: + IsValid = 0 + if CurrentToken == VBS.TOKEN_END: + CurrentSMI = join_scaf_deco(diex[VBS.COLUMN_ENCODER],"".join(y_in[1:CurrentIndex])) + if len(CurrentSMI) > 0: + IsValid = 1 + CurrentTruthTable.append(IsValid) + TruthTable.append(CurrentTruthTable) + # StrPrint = "".join([f"{a:3}" for a in TruthTable]) + # print(f'''{y_in[i][:5]:5} {StrPrint}''') + y_in = [self.vocab_decoder[a] if a in self.vocab_decoder.keys() else self.vocab_decoder[""] for a in y_in ] + Auxiliary = TruthTable + return x_in,y_in,Auxiliary + return fn + def generate_masks(self,X_Encoder, X_Decoder): + with torch.no_grad(): + # Generate encoder, decoder, cross masks + T = X_Decoder.shape[1] + Mask_Encoder = (X_Encoder != self.Token_Padding_Encoder).unsqueeze(-2).unsqueeze(-2) + Mask_Decoder = (X_Decoder != self.Token_Padding_Decoder).unsqueeze(-2).unsqueeze(-2).repeat(1,1,T,1) + Mask_Cross = (X_Encoder != self.Token_Padding_Encoder).unsqueeze(-2).unsqueeze(-2) + mask_tril = torch.tril(torch.ones(T, T)).view(1, 1, T, T).to(Mask_Decoder.device) + Mask_Decoder = Mask_Decoder.masked_fill(mask_tril==0,0) + return Mask_Encoder,Mask_Decoder,Mask_Cross + + def forward(self, X_Encoder, X_Decoder, Y_Decoder_Ref=None,Auxiliary=None): + Mask_Encoder, Mask_Decoder,Mask_Cross = self.generate_masks(X_Encoder, X_Decoder) + # preprocess + X_Encoder = 
self.Dropout1(self.Embedding_Encoder(X_Encoder) * math.sqrt(self.Dim_Embedding) + self.pos_emb(X_Encoder.size(1))) + X_Decoder = self.Dropout2(self.Embedding_Decoder(X_Decoder) * math.sqrt(self.Dim_Embedding) + self.pos_emb(X_Decoder.size(1))) + #### Now X_Encoder: BatchSize, SequenceLength, DimAttention + # Encoder blocks + for encoder_block in self.encoder_blocks: + X_Encoder = encoder_block(X_Encoder,Mask_Encoder) + # X_Encoder = self.LayerNorm1(X_Encoder) + # Decoder blocks + for decoder_block in self.decoder_blocks: + X_Decoder = decoder_block(X_Encoder,X_Decoder,Mask_Cross,Mask_Decoder) + # X_Decoder = self.LayerNorm2(X_Decoder) + Y_Decoder_Logits = self.head(X_Decoder) + loss = None + if Y_Decoder_Ref is not None: + with torch.no_grad(): + Y_OneHot = F.one_hot(Y_Decoder_Ref, num_classes=len(self.vocab_decoder)) * (1-self.Alpha_LabelSmoothing) + # LabelSmooth + LabelSmooth = torch.ones(len(self.List_Vocab_Decoder),device = Y_Decoder_Ref.device) * self.Alpha_LabelSmoothing / (len(self.List_Vocab_Decoder)-1) + Y_OneHot = Y_OneHot + LabelSmooth + # PartialSMILES + TruthTables = Auxiliary + Y_OneHot = Y_OneHot * TruthTables + # TokenWeight + if self.TokenWeight is not None: + Weight = torch.tensor( + self.TokenWeight, + device = Y_Decoder_Ref.device).unsqueeze(0).unsqueeze(0) + Y_OneHot = Y_OneHot * Weight + # IgnoreIndex + Y_OneHot[Y_Decoder_Ref==self.Token_Padding_Decoder] = 0. 
+ Y_Decoder_Logits_LogSoftmax = F.log_softmax(Y_Decoder_Logits,dim=-1) + loss = -(Y_OneHot * Y_Decoder_Logits_LogSoftmax).sum(dim=-1) + loss = loss.mean() + # loss2 = F.kl_div(F.log_softmax(Y_Decoder_Logits,dim=-1),F.one_hot(Y_Decoder_Ref,num_classes=Y_Decoder_Logits.shape[-1]).type_as(Y_Decoder_Logits)) + return Y_Decoder_Logits, loss + + + +# self = trainer.model_module +# X_Encoder = trainer.X_Encoder +# X_Decoder = trainer.X_Decoder +# Y_Decoder_Ref = trainer.Y_Decoder_Ref +# Auxiliary = trainer.Auxiliary + +# from torch.nn import functional as F +# Y_OneHot = F.one_hot(trainer.Y_Decoder_Ref,num_classes=len(trainer.model.vocab_decoder)) +# import math +# import logging +# import torch +# import torch.nn as nn +# from torch.nn import functional as F +# # logger = logging.getLogger(__name__) +# from SCMG.config import varables as VBS +# from torch.autograd import Variable + +# from SmilesPE.pretokenizer import atomwise_tokenizer +# class debug1(): +# def __init__(self): +# self.tokenizer = atomwise_tokenizer +# self.vocab_encoder = torch.load("vocab_atom.pt") +# self.vocab_decoder = torch.load("vocab_atom.pt") + + +# self = debug1() +# bos_token = "bos_token" +# diex={ +# VBS.COLUMN_ENCODER:"[*]c1cc(NC(=O)c2ccccc2)ccc1F", +# VBS.COLUMN_DECODER:"[*]c1cc(NC(=O)c2cc3c(cn2)OCCO3)ccc1F", +# VBS.COLUMN_TASK_TYPE:"", +# VBS.TOKEN_END:"", +# } +# customize_model_fn(self,diex) + + +# rm -r checkpoints/TFdebug9_512_512_6_20220401_0 +# python -i scripts/create_model_SCMG.py \ +# --model_type=Transformer_debug9 \ +# --model_name=TF_512_512_6_debug9 \ +# --num_decoder_layers=6 \ +# --num_heads=8 \ +# --dim_attention=512 \ +# --dim_feedforward=2048 \ +# --dim_embedding=512 \ +# --rate_dropout=0.2 \ +# --tokenizer=atom \ +# --size_block=300 \ +# --filepath_vocab_encoder=vocab_atom.pt \ +# --filepath_vocab_decoder=vocab_atom.pt \ +# --dirpath_checkpoint=checkpoints/TFdebug9_512_512_6_20220401_0 + +# python \ +# -i \ +# scripts/train/train_SCMG.py \ +# 
--dirpath_data=PreProcess_DecoderOnly/TrainingSets_EncoderDecoder_OneDecoder/ \ +# --size_batch=192 \ +# --size_step=1500 \ +# --rate_learning=0.0001 \ +# --gamma=0.1 \ +# --num_workers=32 \ +# --epochs=49 \ +# --dirpath_checkpoint=checkpoints/TFdebug9_512_512_6_20220401_0/ \ +# --log_level=INFO \ +# --run_one_epoch=0 \ +# --dry_run=0 \ +# --dump=1 \ +# --Alpha_LabelSmoothing=0.1 \ No newline at end of file diff --git a/SCMG/models/Transformer_ref/__init__.py b/SCMG/models/Transformer_ref/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/SCMG/models/Transformer_ref/__pycache__/__init__.cpython-310.pyc b/SCMG/models/Transformer_ref/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..93670b84f2d329c5471f40539da29b6a944d65c0 Binary files /dev/null and b/SCMG/models/Transformer_ref/__pycache__/__init__.cpython-310.pyc differ diff --git a/SCMG/models/Transformer_ref/__pycache__/model copy.cpython-310.pyc b/SCMG/models/Transformer_ref/__pycache__/model copy.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e4ff76c3563838c2acaaa6c601de4bef414b594d Binary files /dev/null and b/SCMG/models/Transformer_ref/__pycache__/model copy.cpython-310.pyc differ diff --git a/SCMG/models/Transformer_ref/__pycache__/model.cpython-310.pyc b/SCMG/models/Transformer_ref/__pycache__/model.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b85ff8f00454f4d18df378f0ec321b9e2eb255f9 Binary files /dev/null and b/SCMG/models/Transformer_ref/__pycache__/model.cpython-310.pyc differ diff --git a/SCMG/models/Transformer_ref/__pycache__/sampler.cpython-310.pyc b/SCMG/models/Transformer_ref/__pycache__/sampler.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1439f107862502f9db1af61aee1b966f145584f4 Binary files /dev/null and 
b/SCMG/models/Transformer_ref/__pycache__/sampler.cpython-310.pyc differ diff --git a/SCMG/models/Transformer_ref/model copy.py b/SCMG/models/Transformer_ref/model copy.py new file mode 100644 index 0000000000000000000000000000000000000000..85ed98da342e63696371099158471e07cd1bf25c --- /dev/null +++ b/SCMG/models/Transformer_ref/model copy.py @@ -0,0 +1,187 @@ +import math +import logging + +import torch +import torch.nn as nn +from torch.nn import functional as F + +logger = logging.getLogger(__name__) +from SCMG.config import varables + +# class ModelConfig(): +# rate_dropout_embedding = 0.1 +# rate_dropout_residue = 0.1 +# rate_dropout_attention = 0.1 +# block_size=125 +# def __init__(self, size_vocab, **kwargs): +# self.size_vocab = size_vocab +# for k,v in kwargs.items(): +# setattr(self, k, v) + +class CausalSelfAttention(nn.Module): + def __init__(self, config): + super().__init__() + assert config[varables.DIM_ATTENTION] % config[varables.NUM_HEADS] == 0 + self.key = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION]) + self.query = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION]) + self.value = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION]) + self.dropout_attention = nn.Dropout(config[varables.RATE_DROPOUT]) + self.dropout_residue = nn.Dropout(config[varables.RATE_DROPOUT]) + self.projection = nn.Linear(config[varables.DIM_ATTENTION], config[varables.DIM_EMBEDDING]) + self.register_buffer("mask", torch.tril(torch.ones(config[varables.SIZE_BLOCK], config[varables.SIZE_BLOCK])) + .view(1, 1, config[varables.SIZE_BLOCK], config[varables.SIZE_BLOCK])) + self.n_head = config[varables.NUM_HEADS] + self.single_head_dim = config[varables.DIM_ATTENTION] // self.n_head + self.attention_features = config[varables.DIM_ATTENTION] + + def forward(self, x, layer_past=None): + B, T, C = x.size() + k = self.key(x).view(B, T, self.n_head,self.single_head_dim).transpose(1, 2) + q = self.query(x).view(B, 
T, self.n_head,self.single_head_dim).transpose(1, 2) + v = self.value(x).view(B, T, self.n_head,self.single_head_dim).transpose(1, 2) + att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1))) + att = att.masked_fill(self.mask[:,:,:T,:T] == 0, float('-inf')) + att = F.softmax(att, dim=-1) + att = self.dropout_attention(att) + y = att @ v + y = y.transpose(1, 2).contiguous().view(B, T, self.attention_features) + y = self.dropout_residue(self.projection(y)) + return y + + +class CrossAttention(nn.Module): + def __init__(self, config): + super().__init__() + assert config[varables.DIM_ATTENTION] % config[varables.NUM_HEADS] == 0 + self.key = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION]) + self.query = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION]) + self.value = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION]) + self.dropout_attention = nn.Dropout(config[varables.RATE_DROPOUT]) + self.dropout_residue = nn.Dropout(config[varables.RATE_DROPOUT]) + self.projection = nn.Linear(config[varables.DIM_ATTENTION], config[varables.DIM_EMBEDDING]) + self.n_head = config[varables.NUM_HEADS] + self.single_head_dim = config[varables.DIM_ATTENTION] // self.n_head + self.attention_features = config[varables.DIM_ATTENTION] + self.register_buffer("mask", torch.tril(torch.ones(config[varables.SIZE_BLOCK], config[varables.SIZE_BLOCK])) + .view(1, 1, config[varables.SIZE_BLOCK], config[varables.SIZE_BLOCK])) + + def forward(self, x_encoder,x_decoder, layer_past=None): + B_encoder, T_encoder, C_encoder = x_encoder.size() + B_decoder, T_decoder, C_decoder = x_decoder.size() + k = self.key( x_encoder).view(B_encoder, T_encoder, self.n_head,self.single_head_dim).transpose(1, 2) + q = self.query(x_decoder).view(B_encoder, T_decoder, self.n_head,self.single_head_dim).transpose(1, 2) + v = self.value(x_encoder).view(B_encoder, T_encoder, self.n_head,self.single_head_dim).transpose(1, 2) + att = (q @ 
k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1))) + att = att.masked_fill(self.mask[:,:,:T_decoder,:T_encoder] == 0, float('-inf')) + att = F.softmax(att, dim=-1) + att = self.dropout_attention(att) + y = att @ v + y = y.transpose(1, 2).contiguous().view(B_encoder, T_decoder, self.attention_features) + y = self.dropout_residue(self.projection(y)) + return y + + + + +class EncoderBlock(nn.Module): + def __init__(self, config): + super().__init__() + self.ln1 = nn.LayerNorm(config[varables.DIM_EMBEDDING]) + self.ln2 = nn.LayerNorm(config[varables.DIM_EMBEDDING]) + self.attn = CausalSelfAttention(config) + self.mlp = nn.Sequential( + nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_FEEDFORWARD]), + nn.GELU(), + nn.Linear(config[varables.DIM_FEEDFORWARD], config[varables.DIM_EMBEDDING]), + nn.Dropout(config[varables.RATE_DROPOUT]), + ) + + def forward(self, x): + # = y_input + x = x + self.attn(self.ln1(x)) + x = x + self.mlp(self.ln2(x)) + return x + +class DecoderBlock(nn.Module): + def __init__(self, config): + super().__init__() + self.ln1 = nn.LayerNorm(config[varables.DIM_EMBEDDING]) + self.ln2 = nn.LayerNorm(config[varables.DIM_EMBEDDING]) + self.masked_attn = CausalSelfAttention(config) + self.cross_attn = CrossAttention(config) + self.mlp = nn.Sequential( + nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_FEEDFORWARD]), + nn.GELU(), + nn.Linear(config[varables.DIM_FEEDFORWARD], config[varables.DIM_EMBEDDING]), + nn.Dropout(config[varables.RATE_DROPOUT]), + ) + + def forward(self, x_encoder,x): + # = y_input + x = x + self.masked_attn(self.ln1(x)) + x = x + self.cross_attn(x_encoder,self.ln1(x)) + x = x + self.mlp(self.ln2(x)) + return x + +class Model(nn.Module): + def __init__(self, config): + super().__init__() + self.tok_emb = nn.Embedding(config[varables.SIZE_VOCAB], config[varables.DIM_EMBEDDING]) + self.pos_emb = nn.Parameter(torch.zeros(1, config[varables.SIZE_BLOCK], config[varables.DIM_EMBEDDING])) + self.drop = 
nn.Dropout(config[varables.RATE_DROPOUT]) + self.encoder_blocks = nn.ModuleList([EncoderBlock(config) for _ in range(config[varables.NUM_LAYERS])]) + self.decoder_blocks = nn.ModuleList([DecoderBlock(config) for _ in range(config[varables.NUM_LAYERS])]) + # self.blocks = nn.Sequential(*[DecoderBlock(config) for _ in range(config[varables.NUM_LAYERS])]) + self.ln_f = nn.LayerNorm(config[varables.DIM_EMBEDDING]) + self.head = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.SIZE_VOCAB], bias=False) + self.block_size = config[varables.SIZE_BLOCK] + self.apply(self._init_weights) + logger.info("number of parameters: %e", sum(p.numel() for p in self.parameters())) + self.optimizer = None + + def get_block_size(self): + return self.block_size + + def _init_weights(self, module): + if isinstance(module, (nn.Linear, nn.Embedding)): + module.weight.data.normal_(mean=0.0, std=0.02) + if isinstance(module, nn.Linear) and module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.LayerNorm): + module.bias.data.zero_() + module.weight.data.fill_(1.0) + def init_optimizers(self,train_config): + optimizer = torch.optim.Adam(self.parameters(), lr=train_config[varables.RATE_LEARNING]) + return optimizer + def init_scheduler(self,train_config): + scheduler = torch.optim.lr_scheduler.StepLR(self.optimizer, step_size=train_config[varables.SIZE_STEP], gamma=train_config[varables.GAMMA]) + return scheduler + def get_collate_fn(self, vocab): + def collate(results): + x_in = [a[0] for a in results] + y_in = [a[1] for a in results] + boundary = -1 + max_len_x = max([len(a) for a in x_in]) + max_len_y = max([len(a) for a in y_in]) + x = torch.tensor([(a+[vocab[varables.TOKEN_PAD]]*(max_len_x-len(a))) for a in x_in],dtype=torch.long) + y = torch.tensor([(a+[vocab[varables.TOKEN_PAD]]*(max_len_y-len(a))) for a in y_in],dtype=torch.long) + return x,y,boundary + return collate + + def forward(self, x_in, y_in, y_out=None,boundary=None): + x_in = 
self.drop(self.tok_emb(x_in) + self.pos_emb[:, :x_in.size()[1], :]) + y_in = self.drop(self.tok_emb(y_in) + self.pos_emb[:, :y_in.size()[1], :]) + # + for encoder_block in self.encoder_blocks: + x_in = encoder_block(x_in) + x_in = self.ln_f(x_in) + for decoder_block in self.decoder_blocks: + y_in = decoder_block(x_in,y_in) + y_in = self.ln_f(y_in) + logits = self.head(y_in) + loss = None + if y_out is not None: + loss = F.cross_entropy(logits.view(-1, logits.size(-1)), y_out.view(-1)) + return logits, loss + +# mark test \ No newline at end of file diff --git a/SCMG/models/Transformer_ref/model.py b/SCMG/models/Transformer_ref/model.py new file mode 100644 index 0000000000000000000000000000000000000000..8b254fbe02eafd3f7370cfae17eb6729563aa260 --- /dev/null +++ b/SCMG/models/Transformer_ref/model.py @@ -0,0 +1,420 @@ +import math +import logging + +import torch +import torch.nn as nn +from torch.nn import functional as F + +logger = logging.getLogger(__name__) +from SCMG.config import varables + +# class ModelConfig(): +# rate_dropout_embedding = 0.1 +# rate_dropout_residue = 0.1 +# rate_dropout_attention = 0.1 +# block_size=125 +# def __init__(self, size_vocab, **kwargs): +# self.size_vocab = size_vocab +# for k,v in kwargs.items(): +# setattr(self, k, v) + +class CausalSelfAttention(nn.Module): + def __init__(self, config): + super().__init__() + assert config[varables.DIM_ATTENTION] % config[varables.NUM_HEADS] == 0 + self.key = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION]) + self.query = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION]) + self.value = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION]) + self.dropout_attention = nn.Dropout(config[varables.RATE_DROPOUT]) + self.dropout_residue = nn.Dropout(config[varables.RATE_DROPOUT]) + self.projection = nn.Linear(config[varables.DIM_ATTENTION], config[varables.DIM_EMBEDDING]) + self.register_buffer("mask", 
torch.tril(torch.ones(config[varables.SIZE_BLOCK], config[varables.SIZE_BLOCK])) + .view(1, 1, config[varables.SIZE_BLOCK], config[varables.SIZE_BLOCK])) + self.n_head = config[varables.NUM_HEADS] + self.single_head_dim = config[varables.DIM_ATTENTION] // self.n_head + self.attention_features = config[varables.DIM_ATTENTION] + + def forward(self, x, layer_past=None): + B, T, C = x.size() + k = self.key(x).view(B, T, self.n_head,self.single_head_dim).transpose(1, 2) + q = self.query(x).view(B, T, self.n_head,self.single_head_dim).transpose(1, 2) + v = self.value(x).view(B, T, self.n_head,self.single_head_dim).transpose(1, 2) + att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1))) + att = att.masked_fill(self.mask[:,:,:T,:T] == 0, float('-inf')) + att = F.softmax(att, dim=-1) + att = self.dropout_attention(att) + y = att @ v + y = y.transpose(1, 2).contiguous().view(B, T, self.attention_features) + y = self.dropout_residue(self.projection(y)) + return y + + +class CrossAttention(nn.Module): + def __init__(self, config): + super().__init__() + assert config[varables.DIM_ATTENTION] % config[varables.NUM_HEADS] == 0 + self.key = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION]) + self.query = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION]) + self.value = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION]) + self.dropout_attention = nn.Dropout(config[varables.RATE_DROPOUT]) + self.dropout_residue = nn.Dropout(config[varables.RATE_DROPOUT]) + self.projection = nn.Linear(config[varables.DIM_ATTENTION], config[varables.DIM_EMBEDDING]) + self.n_head = config[varables.NUM_HEADS] + self.single_head_dim = config[varables.DIM_ATTENTION] // self.n_head + self.attention_features = config[varables.DIM_ATTENTION] + self.register_buffer("mask", torch.tril(torch.ones(config[varables.SIZE_BLOCK], config[varables.SIZE_BLOCK])) + .view(1, 1, config[varables.SIZE_BLOCK], config[varables.SIZE_BLOCK])) + + def 
forward(self, x_encoder,x_decoder, layer_past=None): + B_encoder, T_encoder, C_encoder = x_encoder.size() + B_decoder, T_decoder, C_decoder = x_decoder.size() + k = self.key( x_encoder).view(B_encoder, T_encoder, self.n_head,self.single_head_dim).transpose(1, 2) + q = self.query(x_decoder).view(B_encoder, T_decoder, self.n_head,self.single_head_dim).transpose(1, 2) + v = self.value(x_encoder).view(B_encoder, T_encoder, self.n_head,self.single_head_dim).transpose(1, 2) + att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1))) + att = att.masked_fill(self.mask[:,:,:T_decoder,:T_encoder] == 0, float('-inf')) + att = F.softmax(att, dim=-1) + att = self.dropout_attention(att) + y = att @ v + y = y.transpose(1, 2).contiguous().view(B_encoder, T_decoder, self.attention_features) + y = self.dropout_residue(self.projection(y)) + return y + + + + +class EncoderBlock(nn.Module): + def __init__(self, config): + super().__init__() + self.ln1 = nn.LayerNorm(config[varables.DIM_EMBEDDING]) + self.ln2 = nn.LayerNorm(config[varables.DIM_EMBEDDING]) + self.attn = CausalSelfAttention(config) + self.mlp = nn.Sequential( + nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_FEEDFORWARD]), + nn.GELU(), + nn.Linear(config[varables.DIM_FEEDFORWARD], config[varables.DIM_EMBEDDING]), + nn.Dropout(config[varables.RATE_DROPOUT]), + ) + + def forward(self, x): + # = y_input + x = x + self.attn(self.ln1(x)) + x = x + self.mlp(self.ln2(x)) + return x + +class DecoderBlock(nn.Module): + def __init__(self, config): + super().__init__() + self.ln1 = nn.LayerNorm(config[varables.DIM_EMBEDDING]) + self.ln2 = nn.LayerNorm(config[varables.DIM_EMBEDDING]) + self.masked_attn = CausalSelfAttention(config) + self.cross_attn = CrossAttention(config) + self.mlp = nn.Sequential( + nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_FEEDFORWARD]), + nn.GELU(), + nn.Linear(config[varables.DIM_FEEDFORWARD], config[varables.DIM_EMBEDDING]), + nn.Dropout(config[varables.RATE_DROPOUT]), 
+ ) + + def forward(self, x_encoder,x): + # = y_input + x = x + self.masked_attn(self.ln1(x)) + x = x + self.cross_attn(x_encoder,self.ln1(x)) + x = x + self.mlp(self.ln2(x)) + return x + + + + + + + + + + + + + + + + + +import torch +import torch.nn as nn +import torch.nn.functional as F +import math + + +class Norm(nn.Module): + def __init__(self, d_model, eps = 1e-6): + super().__init__() + + self.size = d_model + + # create two learnable parameters to calibrate normalisation + self.alpha = nn.Parameter(torch.ones(self.size)) + self.bias = nn.Parameter(torch.zeros(self.size)) + + self.eps = eps + + def forward(self, x): + norm = self.alpha * (x - x.mean(dim=-1, keepdim=True)) \ + / (x.std(dim=-1, keepdim=True) + self.eps) + self.bias + return norm + +def attention(q, k, v, d_k, mask=None, dropout=None): + + scores = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(d_k) + + if mask is not None: + mask = mask.unsqueeze(1) + scores = scores.masked_fill(mask == 0, -1e9) + + scores = F.softmax(scores, dim=-1) + + if dropout is not None: + scores = dropout(scores) + + output = torch.matmul(scores, v) + return output + +class MultiHeadAttention(nn.Module): + def __init__(self, heads, d_model, dropout = 0.1): + super().__init__() + + self.d_model = d_model + self.d_k = d_model // heads + self.h = heads + + self.q_linear = nn.Linear(d_model, d_model) + self.v_linear = nn.Linear(d_model, d_model) + self.k_linear = nn.Linear(d_model, d_model) + + self.dropout = nn.Dropout(dropout) + self.out = nn.Linear(d_model, d_model) + + def forward(self, q, k, v, mask=None): + + bs = q.size(0) + + # perform linear operation and split into N heads + k = self.k_linear(k).view(bs, -1, self.h, self.d_k) + q = self.q_linear(q).view(bs, -1, self.h, self.d_k) + v = self.v_linear(v).view(bs, -1, self.h, self.d_k) + + # transpose to get dimensions bs * N * sl * d_model + k = k.transpose(1,2) + q = q.transpose(1,2) + v = v.transpose(1,2) + + + # calculate attention using function we will define 
next + scores = attention(q, k, v, self.d_k, mask, self.dropout) + # concatenate heads and put through final linear layer + concat = scores.transpose(1,2).contiguous()\ + .view(bs, -1, self.d_model) + output = self.out(concat) + + return output + +class FeedForward(nn.Module): + def __init__(self, d_model, d_ff=2048, dropout = 0.1): + super().__init__() + + # We set d_ff as a default to 2048 + self.linear_1 = nn.Linear(d_model, d_ff) + self.dropout = nn.Dropout(dropout) + self.linear_2 = nn.Linear(d_ff, d_model) + + def forward(self, x): + x = self.dropout(F.relu(self.linear_1(x))) + x = self.linear_2(x) + return x + + + + +import torch +import torch.nn as nn +import copy + + +class EncoderLayer(nn.Module): + def __init__(self, d_model, heads, dropout=0.1): + super().__init__() + self.norm_1 = Norm(d_model) + self.norm_2 = Norm(d_model) + self.attn = MultiHeadAttention(heads, d_model, dropout=dropout) + self.ff = FeedForward(d_model, dropout=dropout) + self.dropout_1 = nn.Dropout(dropout) + self.dropout_2 = nn.Dropout(dropout) + + def forward(self, x, mask): + x2 = self.norm_1(x) + x = x + self.dropout_1(self.attn(x2,x2,x2,mask)) + x2 = self.norm_2(x) + x = x + self.dropout_2(self.ff(x2)) + return x + +# build a decoder layer with two multi-head attention layers and +# one feed-forward layer +class DecoderLayer(nn.Module): + def __init__(self, d_model, heads, dropout=0.1): + super().__init__() + self.norm_1 = Norm(d_model) + self.norm_2 = Norm(d_model) + self.norm_3 = Norm(d_model) + + self.dropout_1 = nn.Dropout(dropout) + self.dropout_2 = nn.Dropout(dropout) + self.dropout_3 = nn.Dropout(dropout) + + self.attn_1 = MultiHeadAttention(heads, d_model, dropout=dropout) + self.attn_2 = MultiHeadAttention(heads, d_model, dropout=dropout) + self.ff = FeedForward(d_model, dropout=dropout) + + def forward(self, x, e_outputs, src_mask, trg_mask): + x2 = self.norm_1(x) + x = x + self.dropout_1(self.attn_1(x2, x2, x2, trg_mask)) + x2 = self.norm_2(x) + x = x + 
self.dropout_2(self.attn_2(x2, e_outputs, e_outputs, \ + src_mask)) + x2 = self.norm_3(x) + x = x + self.dropout_3(self.ff(x2)) + return x + + +import torch +import torch.nn as nn +import math +from torch.autograd import Variable + +class Embedder(nn.Module): + def __init__(self, vocab_size, d_model): + super().__init__() + self.d_model = d_model + self.embed = nn.Embedding(vocab_size, d_model) + def forward(self, x): + return self.embed(x) + +class PositionalEncoder(nn.Module): + def __init__(self, d_model, max_seq_len = 200, dropout = 0.1): + super().__init__() + self.d_model = d_model + self.dropout = nn.Dropout(dropout) + # create constant 'pe' matrix with values dependant on + # pos and i + pe = torch.zeros(max_seq_len, d_model) + for pos in range(max_seq_len): + for i in range(0, d_model, 2): + pe[pos, i] = \ + math.sin(pos / (10000 ** ((2 * i)/d_model))) + pe[pos, i + 1] = \ + math.cos(pos / (10000 ** ((2 * (i + 1))/d_model))) + pe = pe.unsqueeze(0) + self.register_buffer('pe', pe) + + + def forward(self, x): + # make embeddings relatively larger + x = x * math.sqrt(self.d_model) + #add constant to embedding + seq_len = x.size(1) + pe = Variable(self.pe[:,:seq_len], requires_grad=False) + if x.is_cuda: + pe.cuda() + x = x + pe + return self.dropout(x) + +def get_clones(module, N): + return nn.ModuleList([copy.deepcopy(module) for i in range(N)]) + +class Encoder(nn.Module): + def __init__(self, vocab_size, d_model, N, heads, dropout): + super().__init__() + self.N = N + self.embed = Embedder(vocab_size, d_model) + self.pe = PositionalEncoder(d_model, dropout=dropout) + self.layers = get_clones(EncoderLayer(d_model, heads, dropout), N) + self.norm = Norm(d_model) + def forward(self, src, mask): + x = self.embed(src) + x = self.pe(x) + for i in range(self.N): + x = self.layers[i](x, mask) + return self.norm(x) + +class Decoder(nn.Module): + def __init__(self, vocab_size, d_model, N, heads, dropout): + super().__init__() + self.N = N + self.embed = 
Embedder(vocab_size, d_model) + self.pe = PositionalEncoder(d_model, dropout=dropout) + self.layers = get_clones(DecoderLayer(d_model, heads, dropout), N) + self.norm = Norm(d_model) + def forward(self, trg, e_outputs, src_mask, trg_mask): + x = self.embed(trg) + x = self.pe(x) + for i in range(self.N): + x = self.layers[i](x, e_outputs, src_mask, trg_mask) + return self.norm(x) + +class Model(nn.Module): + def __init__(self, config): + super().__init__() + self.encoder = Encoder(len(config["vocab_encoder"]), config[varables.DIM_ATTENTION], config[varables.NUM_LAYERS], config[varables.NUM_HEADS], config[varables.RATE_DROPOUT]) + self.decoder = Decoder(len(config["vocab_decoder"]), config[varables.DIM_ATTENTION], config[varables.NUM_LAYERS], config[varables.NUM_HEADS], config[varables.RATE_DROPOUT]) + self.out = nn.Linear(config[varables.DIM_ATTENTION], len(config["vocab_decoder"])) + # self.tok_emb = nn.Embedding(config[varables.SIZE_VOCAB], config[varables.DIM_EMBEDDING]) + # self.pos_emb = nn.Parameter(torch.zeros(1, config[varables.SIZE_BLOCK], config[varables.DIM_EMBEDDING])) + # self.drop = nn.Dropout(config[varables.RATE_DROPOUT]) + # self.encoder_blocks = nn.ModuleList([EncoderBlock(config) for _ in range(config[varables.NUM_LAYERS])]) + # self.decoder_blocks = nn.ModuleList([DecoderBlock(config) for _ in range(config[varables.NUM_LAYERS])]) + # self.blocks = nn.Sequential(*[DecoderBlock(config) for _ in range(config[varables.NUM_LAYERS])]) + # self.ln_f = nn.LayerNorm(config[varables.DIM_EMBEDDING]) + # self.head = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.SIZE_VOCAB], bias=False) + # self.block_size = config[varables.SIZE_BLOCK] + # self.apply(self._init_weights) + # logger.info("number of parameters: %e", sum(p.numel() for p in self.parameters())) + self.optimizer = None + + def get_block_size(self): + return self.block_size + + def _init_weights(self, module): + if isinstance(module, (nn.Linear, nn.Embedding)): + 
module.weight.data.normal_(mean=0.0, std=0.02) + if isinstance(module, nn.Linear) and module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.LayerNorm): + module.bias.data.zero_() + module.weight.data.fill_(1.0) + def init_optimizers(self,train_config): + optimizer = torch.optim.Adam(self.parameters(), lr=train_config[varables.RATE_LEARNING]) + return optimizer + def init_scheduler(self,train_config): + scheduler = torch.optim.lr_scheduler.StepLR(self.optimizer, step_size=train_config[varables.SIZE_STEP], gamma=train_config[varables.GAMMA]) + return scheduler + def get_collate_fn(self, vocab_encoder,vocab_decoder): + def collate(results): + x_in = [a[0] for a in results] + y_in = [a[1] for a in results] + boundary = -1 + max_len_x = max([len(a) for a in x_in]) + max_len_y = max([len(a) for a in y_in]) + x = torch.tensor([(a+[vocab_encoder[varables.TOKEN_PAD]]*(max_len_x-len(a))) for a in x_in],dtype=torch.long) + y = torch.tensor([(a+[vocab_decoder[varables.TOKEN_PAD]]*(max_len_y-len(a))) for a in y_in],dtype=torch.long) + return x,y,boundary + return collate + def forward(self, src, trg, trg_out, boundary=None): + src_mask = None + trg_mask = torch.tril(torch.ones(trg.shape[1], trg.shape[1])).view(1, 1, trg.shape[1], trg.shape[1]).to(trg.device) + e_outputs = self.encoder(src, src_mask) + d_output = self.decoder(trg, e_outputs, src_mask, trg_mask) + logits = self.out(d_output) + loss = None + if trg_out is not None: + loss = F.cross_entropy(logits.view(-1, logits.size(-1)), trg_out.view(-1)) + return logits, loss + +# mark test \ No newline at end of file diff --git a/SCMG/models/Transformer_ref/sampler.py b/SCMG/models/Transformer_ref/sampler.py new file mode 100644 index 0000000000000000000000000000000000000000..1c606302447534d425b50bbd15c153ea79895b65 --- /dev/null +++ b/SCMG/models/Transformer_ref/sampler.py @@ -0,0 +1,85 @@ +import random +import numpy as np +import torch +import torch.nn as nn +from torch.nn import functional as F + 
def set_seed(seed):
    """Seed python, numpy and torch RNGs for reproducible sampling."""
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)


def top_k_logits(logits, k):
    """Keep the ``k`` largest logits per row; set every other entry to -inf."""
    v, _ = torch.topk(logits, k)
    out = logits.clone()
    out[out < v[:, [-1]]] = -float('Inf')
    return out


@torch.no_grad()
def sample(model, x, steps, temperature=1.0, boundary=None, top_k=None, greedy=False):
    """Autoregressively extend ``x`` by ``steps`` tokens.

    BUG FIX: the original module defined ``sample`` twice; the second
    definition silently shadowed the first, losing its top-k and greedy
    decoding options.  Both are merged here.  The defaults reproduce the
    second (previously effective) definition exactly, while ``top_k`` and
    ``greedy`` restore the first definition's behaviour.

    Args:
        model: autoregressive model exposing ``get_block_size()`` and
            returning ``(logits, loss)`` when called.
        x: (B, T) long tensor of prompt token indices.
        steps: number of tokens to append.
        temperature: softmax temperature (> 0).
        boundary: optional per-sample prefix lengths forwarded to the model.
        top_k: if given, restrict sampling to the k most likely tokens.
        greedy: if True, take the argmax instead of sampling.

    Returns:
        (B, T + steps) tensor of token indices.
    """
    block_size = model.get_block_size()
    model.eval()
    for _ in range(steps):
        # crop the conditioning context to the model's maximum block size
        x_cond = x if x.size(1) <= block_size else x[:, -block_size:]
        logits, _ = model(x_cond, boundary=boundary)
        logits = logits[:, -1, :] / temperature
        if top_k is not None:
            logits = top_k_logits(logits, top_k)
        probs = F.softmax(logits, dim=-1)
        if greedy:
            _, ix = torch.topk(probs, k=1, dim=-1)
        else:
            ix = torch.multinomial(probs, num_samples=1)
        x = torch.cat((x, ix), dim=1)
    return x


def sample_L(i, option='string'):
    """Interactive linker-sampling driver kept from the original module.

    NOTE(review): ``vocab``, ``inv``, ``model`` and ``test_valid`` are NOT
    defined in this file; this function only runs in a session that supplies
    them as globals (and a CUDA device) -- confirm before relying on it.
    ``option`` is currently unused.
    """
    prefix = 'L_' + str(i)
    string_input = prefix + '*O=C1NN=Cc2c1cccc2.*O=C(C1CC1)N1CCNCC1'
    array_input = [vocab[a] for a in [''] + list(string_input)]
    boundary = [len(array_input)]
    tensor_input = torch.tensor(array_input, device='cuda').unsqueeze(0).repeat(32, 1)
    boundary = boundary * 32
    tensor_output = sample(model, tensor_input, 250, boundary=boundary)
    strings_output = []
    for j in range(tensor_output.shape[0]):
        # decode generated tokens after the prompt, dropping pad-like tokens
        list_string_output = [inv[a] for a in tensor_output[j, boundary[j]:].cpu().numpy() if a != vocab['']]
        if list_string_output[-1] == '':
            list_string_output = list_string_output[:-1]
        string_output = ''.join(list_string_output)
        strings_output.append(string_output)
        print(string_output)
    for j in range(tensor_output.shape[0]):
        # print 1 for a chemically valid output, 0 otherwise
        if test_valid(strings_output[j]):
            print(1)
        else:
            print(0)
a/SCMG/models/UTFMG/__pycache__/sampler.cpython-310.pyc b/SCMG/models/UTFMG/__pycache__/sampler.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..7d8b4302ef6fdd87e2ad670175c89d8f4c473a69 Binary files /dev/null and b/SCMG/models/UTFMG/__pycache__/sampler.cpython-310.pyc differ diff --git a/SCMG/models/UTFMG/config.py b/SCMG/models/UTFMG/config.py new file mode 100644 index 0000000000000000000000000000000000000000..94d854ccf0a67422a41f53a05eb3e4e103a45f18 --- /dev/null +++ b/SCMG/models/UTFMG/config.py @@ -0,0 +1,39 @@ +import argparse + + +def get_parser(parser=None): + if parser is None: + parser = argparse.ArgumentParser() + + # Model + model_arg = parser.add_argument_group('Model') + model_arg.add_argument("--num_layers", type=int, default=3, + help="Number of LSTM layers") + model_arg.add_argument("--hidden", type=int, default=768, + help="Hidden size") + model_arg.add_argument("--dropout", type=float, default=0.2, + help="dropout between LSTM layers except for last") + + # Train + train_arg = parser.add_argument_group('Training') + train_arg.add_argument('--train_epochs', type=int, default=80, + help='Number of epochs for model training') + train_arg.add_argument('--n_batch', type=int, default=64, + help='Size of batch') + train_arg.add_argument('--lr', type=float, default=1e-3, + help='Learning rate') + train_arg.add_argument('--step_size', type=int, default=10, + help='Period of learning rate decay') + train_arg.add_argument('--gamma', type=float, default=0.5, + help='Multiplicative factor of learning rate decay') + train_arg.add_argument('--n_jobs', type=int, default=1, + help='Number of threads') + train_arg.add_argument('--n_workers', type=int, default=1, + help='Number of workers for DataLoaders') + + return parser + + +def get_config(): + parser = get_parser() + return parser.parse_known_args()[0] diff --git a/SCMG/models/UTFMG/model.py b/SCMG/models/UTFMG/model.py new file mode 100644 index 
import math
import logging

import torch
import torch.nn as nn
from torch.nn import functional as F

from SCMG.config import varables

logger = logging.getLogger(__name__)


class CausalSelfAttention(nn.Module):
    """Multi-head self-attention with a causal mask.

    When per-sample ``boundary`` indices are given, the first ``boundary[i]``
    positions of sample ``i`` (the scaffold / prompt prefix) attend to each
    other bidirectionally while the rest of the sequence stays causal.
    """

    def __init__(self, config):
        super().__init__()
        assert config[varables.DIM_ATTENTION] % config[varables.NUM_HEADS] == 0
        self.key = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION])
        self.query = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION])
        self.value = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION])
        self.dropout_attention = nn.Dropout(config[varables.RATE_DROPOUT])
        self.dropout_residue = nn.Dropout(config[varables.RATE_DROPOUT])
        self.projection = nn.Linear(config[varables.DIM_ATTENTION], config[varables.DIM_EMBEDDING])
        # lower-triangular causal mask, registered so it moves with the module's device
        self.register_buffer("mask", torch.tril(torch.ones(config[varables.SIZE_BLOCK], config[varables.SIZE_BLOCK]))
                             .view(1, 1, config[varables.SIZE_BLOCK], config[varables.SIZE_BLOCK]))
        self.n_head = config[varables.NUM_HEADS]
        self.single_head_dim = config[varables.DIM_ATTENTION] // self.n_head
        self.attention_features = config[varables.DIM_ATTENTION]

    def forward(self, x, layer_past=None, boundary=None):
        """Attend over ``x`` of shape (B, T, C); ``layer_past`` is accepted but unused."""
        B, T, C = x.size()
        k = self.key(x).view(B, T, self.n_head, self.single_head_dim).transpose(1, 2)
        q = self.query(x).view(B, T, self.n_head, self.single_head_dim).transpose(1, 2)
        v = self.value(x).view(B, T, self.n_head, self.single_head_dim).transpose(1, 2)
        att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1)))
        if boundary is None:
            att = att.masked_fill(self.mask[:, :, :T, :T] == 0, float('-inf'))
        else:
            # BUG FIX: the original rebuilt this mask from a name ``config``
            # that is not in scope inside forward() (NameError whenever a
            # boundary was supplied), and used a step slice ``::boundary[i]``
            # where the prefix slice ``:boundary[i]`` was clearly intended.
            # Derive the per-sample mask from the registered buffer instead
            # (same values, correct device).  Assumes boundary[i] <= T --
            # TODO confirm against the collate_fn's boundary convention.
            mask = self.mask[:, :, :T, :T].repeat(B, 1, 1, 1)
            for i, b in enumerate(boundary):
                # prefix tokens attend to each other bidirectionally
                mask[i, 0, :b, :b] = 1
            att = att.masked_fill(mask == 0, float('-inf'))
        att = F.softmax(att, dim=-1)
        att = self.dropout_attention(att)
        y = att @ v
        y = y.transpose(1, 2).contiguous().view(B, T, self.attention_features)
        y = self.dropout_residue(self.projection(y))
        return y


class Block(nn.Module):
    """Pre-norm transformer block: causal self-attention + GELU MLP, both residual."""

    def __init__(self, config):
        super().__init__()
        self.ln1 = nn.LayerNorm(config[varables.DIM_EMBEDDING])
        self.ln2 = nn.LayerNorm(config[varables.DIM_EMBEDDING])
        self.attn = CausalSelfAttention(config)
        self.mlp = nn.Sequential(
            nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_FEEDFORWARD]),
            nn.GELU(),
            nn.Linear(config[varables.DIM_FEEDFORWARD], config[varables.DIM_EMBEDDING]),
            nn.Dropout(config[varables.RATE_DROPOUT]),
        )

    def forward(self, x, boundary):
        # BUG FIX: the original passed ``boundary`` positionally, where it
        # landed in the unused ``layer_past`` parameter of the attention
        # layer, so the boundary mask was silently never applied.
        x = x + self.attn(self.ln1(x), boundary=boundary)
        x = x + self.mlp(self.ln2(x))
        return x


class Model(nn.Module):
    """Decoder-only transformer language model over token indices.

    ``config`` is a dict keyed by the constants in ``SCMG.config.varables``.
    """

    def __init__(self, config):
        super().__init__()
        self.tok_emb = nn.Embedding(config[varables.SIZE_VOCAB], config[varables.DIM_EMBEDDING])
        # learned absolute positional embeddings, one row per block position
        self.pos_emb = nn.Parameter(torch.zeros(1, config[varables.SIZE_BLOCK], config[varables.DIM_EMBEDDING]))
        self.drop = nn.Dropout(config[varables.RATE_DROPOUT])
        self.blocks = nn.ModuleList([Block(config) for _ in range(config[varables.NUM_LAYERS])])
        self.ln_f = nn.LayerNorm(config[varables.DIM_EMBEDDING])
        self.head = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.SIZE_VOCAB], bias=False)
        self.block_size = config[varables.SIZE_BLOCK]
        self.apply(self._init_weights)
        logger.info("number of parameters: %e", sum(p.numel() for p in self.parameters()))
        # set by the training loop before init_scheduler() is called
        self.optimizer = None

    def get_block_size(self):
        """Maximum sequence length the positional embeddings support."""
        return self.block_size

    def _init_weights(self, module):
        """GPT-style init: N(0, 0.02) weights, zero biases, unit LayerNorm gain."""
        if isinstance(module, (nn.Linear, nn.Embedding)):
            module.weight.data.normal_(mean=0.0, std=0.02)
            if isinstance(module, nn.Linear) and module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

    def init_optimizers(self, train_config):
        """Create an Adam optimizer over all parameters.

        The caller is expected to assign the result to ``self.optimizer``
        before calling :meth:`init_scheduler`.
        """
        optimizer = torch.optim.Adam(self.parameters(), lr=train_config[varables.RATE_LEARNING])
        return optimizer

    def init_scheduler(self, train_config):
        """Step-decay LR scheduler bound to ``self.optimizer``.

        NOTE(review): fails if ``self.optimizer`` has not been assigned yet.
        """
        scheduler = torch.optim.lr_scheduler.StepLR(
            self.optimizer,
            step_size=train_config[varables.SIZE_STEP],
            gamma=train_config[varables.GAMMA],
        )
        return scheduler

    def get_collate_fn(self, vocab, vocab2):
        """Return a DataLoader ``collate_fn`` joining (prefix, target) pairs.

        Each sample is ``prefix + [SEP] + target`` right-padded to the batch
        maximum; ``boundary`` carries each sample's third element (prefix
        length).  ``vocab2`` is unused -- kept for signature parity with the
        encoder/decoder models.
        """
        def collate(results):
            x_in = None  # decoder-only model: no separate encoder input
            y_in = [a[0] + [vocab[varables.TOKEN_SEP]] + a[1] for a in results]
            boundary = [a[2] for a in results]
            max_len = max(len(a) for a in y_in)
            pad = vocab[varables.TOKEN_PAD]
            y = torch.tensor([a + [pad] * (max_len - len(a)) for a in y_in], dtype=torch.long)
            return x_in, y, boundary
        return collate

    def forward(self, x_in, y_in, y_out=None, boundary=None):
        """Compute next-token logits (and loss when ``y_out`` is given).

        Args:
            x_in: unused (decoder-only); kept for interface parity.
            y_in: (B, T) long tensor of input token indices, T <= block_size.
            y_out: optional (B, T) target indices for cross-entropy.
            boundary: optional per-sample prefix lengths (see attention).

        Returns:
            ``(logits, loss)`` where ``loss`` is ``None`` without targets.
        """
        b, t = y_in.size()
        assert t <= self.block_size
        token_embeddings = self.tok_emb(y_in)
        position_embeddings = self.pos_emb[:, :t, :]
        x = self.drop(token_embeddings + position_embeddings)
        for block in self.blocks:
            x = block(x, boundary)
        x = self.ln_f(x)
        logits = self.head(x)
        loss = None
        if y_out is not None:
            # NOTE(review): pad positions are not excluded from the loss;
            # consider ignore_index=vocab[TOKEN_PAD] in cross_entropy.
            loss = F.cross_entropy(logits.view(-1, logits.size(-1)), y_out.view(-1))
        return logits, loss
import numpy as np
import torch
import torch.nn as nn
from torch.nn import functional as F
import random  # imported here defensively; set_seed below requires it


def set_seed(seed):
    """Seed python, numpy and torch RNGs for reproducible sampling."""
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)


def top_k_logits(logits, k):
    """Keep the ``k`` largest logits per row; set every other entry to -inf."""
    v, _ = torch.topk(logits, k)
    out = logits.clone()
    out[out < v[:, [-1]]] = -float('Inf')
    return out


@torch.no_grad()
def sample(model, x, steps, temperature=1.0, boundary=None, top_k=None, greedy=False):
    """Autoregressively extend ``x`` by ``steps`` tokens.

    BUG FIX: the original module defined ``sample`` twice; the second
    definition silently shadowed the first, losing its top-k and greedy
    decoding options.  Both are merged here.  The defaults reproduce the
    second (previously effective) definition exactly, while ``top_k`` and
    ``greedy`` restore the first definition's behaviour.

    Args:
        model: autoregressive model exposing ``get_block_size()`` and
            returning ``(logits, loss)`` when called.
        x: (B, T) long tensor of prompt token indices.
        steps: number of tokens to append.
        temperature: softmax temperature (> 0).
        boundary: optional per-sample prefix lengths forwarded to the model.
        top_k: if given, restrict sampling to the k most likely tokens.
        greedy: if True, take the argmax instead of sampling.

    Returns:
        (B, T + steps) tensor of token indices.
    """
    block_size = model.get_block_size()
    model.eval()
    for _ in range(steps):
        # crop the conditioning context to the model's maximum block size
        x_cond = x if x.size(1) <= block_size else x[:, -block_size:]
        logits, _ = model(x_cond, boundary=boundary)
        logits = logits[:, -1, :] / temperature
        if top_k is not None:
            logits = top_k_logits(logits, top_k)
        probs = F.softmax(logits, dim=-1)
        if greedy:
            _, ix = torch.topk(probs, k=1, dim=-1)
        else:
            ix = torch.multinomial(probs, num_samples=1)
        x = torch.cat((x, ix), dim=1)
    return x


def sample_L(i, option='string'):
    """Interactive linker-sampling driver kept from the original module.

    NOTE(review): ``vocab``, ``inv``, ``model`` and ``test_valid`` are NOT
    defined in this file; this function only runs in a session that supplies
    them as globals (and a CUDA device) -- confirm before relying on it.
    ``option`` is currently unused.
    """
    prefix = 'L_' + str(i)
    string_input = prefix + '*O=C1NN=Cc2c1cccc2.*O=C(C1CC1)N1CCNCC1'
    array_input = [vocab[a] for a in [''] + list(string_input)]
    boundary = [len(array_input)]
    tensor_input = torch.tensor(array_input, device='cuda').unsqueeze(0).repeat(32, 1)
    boundary = boundary * 32
    tensor_output = sample(model, tensor_input, 250, boundary=boundary)
    strings_output = []
    for j in range(tensor_output.shape[0]):
        # decode generated tokens after the prompt, dropping pad-like tokens
        list_string_output = [inv[a] for a in tensor_output[j, boundary[j]:].cpu().numpy() if a != vocab['']]
        if list_string_output[-1] == '':
            list_string_output = list_string_output[:-1]
        string_output = ''.join(list_string_output)
        strings_output.append(string_output)
        print(string_output)
    for j in range(tensor_output.shape[0]):
        # print 1 for a chemically valid output, 0 otherwise
        if test_valid(strings_output[j]):
            print(1)
        else:
            print(0)
0000000000000000000000000000000000000000..43d95eca0861879943db41d5c6171fb4cf48f2f7 Binary files /dev/null and b/SCMG/models/UTFMG2/__pycache__/model copy.cpython-310.pyc differ diff --git a/SCMG/models/UTFMG2/__pycache__/model.cpython-310.pyc b/SCMG/models/UTFMG2/__pycache__/model.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..758d86f65d5be18c42aece49ea8541107eb339f2 Binary files /dev/null and b/SCMG/models/UTFMG2/__pycache__/model.cpython-310.pyc differ diff --git a/SCMG/models/UTFMG2/__pycache__/sampler.cpython-310.pyc b/SCMG/models/UTFMG2/__pycache__/sampler.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0b60eb1e593e7e5687f7467a526d4616c061305f Binary files /dev/null and b/SCMG/models/UTFMG2/__pycache__/sampler.cpython-310.pyc differ diff --git a/SCMG/models/UTFMG2/model copy 2.py b/SCMG/models/UTFMG2/model copy 2.py new file mode 100644 index 0000000000000000000000000000000000000000..8b254fbe02eafd3f7370cfae17eb6729563aa260 --- /dev/null +++ b/SCMG/models/UTFMG2/model copy 2.py @@ -0,0 +1,420 @@ +import math +import logging + +import torch +import torch.nn as nn +from torch.nn import functional as F + +logger = logging.getLogger(__name__) +from SCMG.config import varables + +# class ModelConfig(): +# rate_dropout_embedding = 0.1 +# rate_dropout_residue = 0.1 +# rate_dropout_attention = 0.1 +# block_size=125 +# def __init__(self, size_vocab, **kwargs): +# self.size_vocab = size_vocab +# for k,v in kwargs.items(): +# setattr(self, k, v) + +class CausalSelfAttention(nn.Module): + def __init__(self, config): + super().__init__() + assert config[varables.DIM_ATTENTION] % config[varables.NUM_HEADS] == 0 + self.key = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION]) + self.query = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION]) + self.value = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION]) + self.dropout_attention = 
nn.Dropout(config[varables.RATE_DROPOUT]) + self.dropout_residue = nn.Dropout(config[varables.RATE_DROPOUT]) + self.projection = nn.Linear(config[varables.DIM_ATTENTION], config[varables.DIM_EMBEDDING]) + self.register_buffer("mask", torch.tril(torch.ones(config[varables.SIZE_BLOCK], config[varables.SIZE_BLOCK])) + .view(1, 1, config[varables.SIZE_BLOCK], config[varables.SIZE_BLOCK])) + self.n_head = config[varables.NUM_HEADS] + self.single_head_dim = config[varables.DIM_ATTENTION] // self.n_head + self.attention_features = config[varables.DIM_ATTENTION] + + def forward(self, x, layer_past=None): + B, T, C = x.size() + k = self.key(x).view(B, T, self.n_head,self.single_head_dim).transpose(1, 2) + q = self.query(x).view(B, T, self.n_head,self.single_head_dim).transpose(1, 2) + v = self.value(x).view(B, T, self.n_head,self.single_head_dim).transpose(1, 2) + att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1))) + att = att.masked_fill(self.mask[:,:,:T,:T] == 0, float('-inf')) + att = F.softmax(att, dim=-1) + att = self.dropout_attention(att) + y = att @ v + y = y.transpose(1, 2).contiguous().view(B, T, self.attention_features) + y = self.dropout_residue(self.projection(y)) + return y + + +class CrossAttention(nn.Module): + def __init__(self, config): + super().__init__() + assert config[varables.DIM_ATTENTION] % config[varables.NUM_HEADS] == 0 + self.key = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION]) + self.query = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION]) + self.value = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION]) + self.dropout_attention = nn.Dropout(config[varables.RATE_DROPOUT]) + self.dropout_residue = nn.Dropout(config[varables.RATE_DROPOUT]) + self.projection = nn.Linear(config[varables.DIM_ATTENTION], config[varables.DIM_EMBEDDING]) + self.n_head = config[varables.NUM_HEADS] + self.single_head_dim = config[varables.DIM_ATTENTION] // self.n_head + 
self.attention_features = config[varables.DIM_ATTENTION] + self.register_buffer("mask", torch.tril(torch.ones(config[varables.SIZE_BLOCK], config[varables.SIZE_BLOCK])) + .view(1, 1, config[varables.SIZE_BLOCK], config[varables.SIZE_BLOCK])) + + def forward(self, x_encoder,x_decoder, layer_past=None): + B_encoder, T_encoder, C_encoder = x_encoder.size() + B_decoder, T_decoder, C_decoder = x_decoder.size() + k = self.key( x_encoder).view(B_encoder, T_encoder, self.n_head,self.single_head_dim).transpose(1, 2) + q = self.query(x_decoder).view(B_encoder, T_decoder, self.n_head,self.single_head_dim).transpose(1, 2) + v = self.value(x_encoder).view(B_encoder, T_encoder, self.n_head,self.single_head_dim).transpose(1, 2) + att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1))) + att = att.masked_fill(self.mask[:,:,:T_decoder,:T_encoder] == 0, float('-inf')) + att = F.softmax(att, dim=-1) + att = self.dropout_attention(att) + y = att @ v + y = y.transpose(1, 2).contiguous().view(B_encoder, T_decoder, self.attention_features) + y = self.dropout_residue(self.projection(y)) + return y + + + + +class EncoderBlock(nn.Module): + def __init__(self, config): + super().__init__() + self.ln1 = nn.LayerNorm(config[varables.DIM_EMBEDDING]) + self.ln2 = nn.LayerNorm(config[varables.DIM_EMBEDDING]) + self.attn = CausalSelfAttention(config) + self.mlp = nn.Sequential( + nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_FEEDFORWARD]), + nn.GELU(), + nn.Linear(config[varables.DIM_FEEDFORWARD], config[varables.DIM_EMBEDDING]), + nn.Dropout(config[varables.RATE_DROPOUT]), + ) + + def forward(self, x): + # = y_input + x = x + self.attn(self.ln1(x)) + x = x + self.mlp(self.ln2(x)) + return x + +class DecoderBlock(nn.Module): + def __init__(self, config): + super().__init__() + self.ln1 = nn.LayerNorm(config[varables.DIM_EMBEDDING]) + self.ln2 = nn.LayerNorm(config[varables.DIM_EMBEDDING]) + self.masked_attn = CausalSelfAttention(config) + self.cross_attn = 
# ============================================================================
# Reconstructed from diff-mangled source: classic encoder/decoder Transformer
# building blocks (Annotated-Transformer style) used by the SCMG models.
# The preceding partial DecoderBlock body was cut off before this chunk and is
# not reproduced here.
# ============================================================================

import math
import copy

import torch
import torch.nn as nn
import torch.nn.functional as F


class Norm(nn.Module):
    """Layer normalisation with a learnable gain (`alpha`) and bias.

    NOTE(review): ``Tensor.std`` uses the *sample* std (ddof=1), so this is
    not numerically identical to ``nn.LayerNorm``; preserved as written.
    """

    def __init__(self, d_model, eps=1e-6):
        super().__init__()
        self.size = d_model
        # Two learnable parameters to calibrate the normalisation.
        self.alpha = nn.Parameter(torch.ones(self.size))
        self.bias = nn.Parameter(torch.zeros(self.size))
        self.eps = eps

    def forward(self, x):
        mean = x.mean(dim=-1, keepdim=True)
        std = x.std(dim=-1, keepdim=True)
        return self.alpha * (x - mean) / (std + self.eps) + self.bias


def attention(q, k, v, d_k, mask=None, dropout=None):
    """Scaled dot-product attention.

    q, k, v: (batch, heads, seq, d_k) tensors.  Positions where ``mask == 0``
    are filled with -1e9 before the softmax; ``dropout`` (a module) is applied
    to the attention weights when given.
    """
    scores = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(d_k)

    if mask is not None:
        # Broadcast the mask over the head dimension.
        mask = mask.unsqueeze(1)
        scores = scores.masked_fill(mask == 0, -1e9)

    scores = F.softmax(scores, dim=-1)

    if dropout is not None:
        scores = dropout(scores)

    return torch.matmul(scores, v)


class MultiHeadAttention(nn.Module):
    """Multi-head attention: project q/k/v, attend per head, re-project."""

    def __init__(self, heads, d_model, dropout=0.1):
        super().__init__()
        self.d_model = d_model
        self.d_k = d_model // heads
        self.h = heads

        self.q_linear = nn.Linear(d_model, d_model)
        self.v_linear = nn.Linear(d_model, d_model)
        self.k_linear = nn.Linear(d_model, d_model)

        self.dropout = nn.Dropout(dropout)
        self.out = nn.Linear(d_model, d_model)

    def forward(self, q, k, v, mask=None):
        bs = q.size(0)

        # Linear projections, then split the feature dim into `h` heads.
        k = self.k_linear(k).view(bs, -1, self.h, self.d_k)
        q = self.q_linear(q).view(bs, -1, self.h, self.d_k)
        v = self.v_linear(v).view(bs, -1, self.h, self.d_k)

        # -> (bs, heads, seq, d_k)
        k = k.transpose(1, 2)
        q = q.transpose(1, 2)
        v = v.transpose(1, 2)

        scores = attention(q, k, v, self.d_k, mask, self.dropout)

        # Concatenate heads and apply the final linear layer.
        concat = scores.transpose(1, 2).contiguous().view(bs, -1, self.d_model)
        return self.out(concat)


class FeedForward(nn.Module):
    """Position-wise feed-forward block: Linear -> ReLU -> dropout -> Linear."""

    def __init__(self, d_model, d_ff=2048, dropout=0.1):
        super().__init__()
        self.linear_1 = nn.Linear(d_model, d_ff)
        self.dropout = nn.Dropout(dropout)
        self.linear_2 = nn.Linear(d_ff, d_model)

    def forward(self, x):
        x = self.dropout(F.relu(self.linear_1(x)))
        return self.linear_2(x)


class EncoderLayer(nn.Module):
    """Pre-norm encoder layer: self-attention + feed-forward, both residual."""

    def __init__(self, d_model, heads, dropout=0.1):
        super().__init__()
        self.norm_1 = Norm(d_model)
        self.norm_2 = Norm(d_model)
        self.attn = MultiHeadAttention(heads, d_model, dropout=dropout)
        self.ff = FeedForward(d_model, dropout=dropout)
        self.dropout_1 = nn.Dropout(dropout)
        self.dropout_2 = nn.Dropout(dropout)

    def forward(self, x, mask):
        x2 = self.norm_1(x)
        x = x + self.dropout_1(self.attn(x2, x2, x2, mask))
        x2 = self.norm_2(x)
        x = x + self.dropout_2(self.ff(x2))
        return x


class DecoderLayer(nn.Module):
    """Pre-norm decoder layer: masked self-attn, cross-attn, feed-forward."""

    def __init__(self, d_model, heads, dropout=0.1):
        super().__init__()
        self.norm_1 = Norm(d_model)
        self.norm_2 = Norm(d_model)
        self.norm_3 = Norm(d_model)

        self.dropout_1 = nn.Dropout(dropout)
        self.dropout_2 = nn.Dropout(dropout)
        self.dropout_3 = nn.Dropout(dropout)

        self.attn_1 = MultiHeadAttention(heads, d_model, dropout=dropout)
        self.attn_2 = MultiHeadAttention(heads, d_model, dropout=dropout)
        self.ff = FeedForward(d_model, dropout=dropout)

    def forward(self, x, e_outputs, src_mask, trg_mask):
        x2 = self.norm_1(x)
        x = x + self.dropout_1(self.attn_1(x2, x2, x2, trg_mask))
        x2 = self.norm_2(x)
        x = x + self.dropout_2(self.attn_2(x2, e_outputs, e_outputs, src_mask))
        x2 = self.norm_3(x)
        x = x + self.dropout_3(self.ff(x2))
        return x


class Embedder(nn.Module):
    """Thin wrapper around nn.Embedding."""

    def __init__(self, vocab_size, d_model):
        super().__init__()
        self.d_model = d_model
        self.embed = nn.Embedding(vocab_size, d_model)

    def forward(self, x):
        return self.embed(x)


class PositionalEncoder(nn.Module):
    """Adds a fixed sinusoidal positional encoding to scaled embeddings.

    NOTE(review): the cos entries use exponent ``2*(i+1)/d_model`` while the
    sin entries use ``2*i/d_model`` — this deviates from the canonical
    formulation where each sin/cos pair shares one frequency.  Preserved
    as written so that any trained checkpoints keep working.
    """

    def __init__(self, d_model, max_seq_len=200, dropout=0.1):
        super().__init__()
        self.d_model = d_model
        self.dropout = nn.Dropout(dropout)
        # Constant 'pe' matrix whose values depend on position and channel.
        pe = torch.zeros(max_seq_len, d_model)
        for pos in range(max_seq_len):
            for i in range(0, d_model, 2):
                pe[pos, i] = math.sin(pos / (10000 ** ((2 * i) / d_model)))
                pe[pos, i + 1] = math.cos(pos / (10000 ** ((2 * (i + 1)) / d_model)))
        pe = pe.unsqueeze(0)
        self.register_buffer('pe', pe)

    def forward(self, x):
        # Make the embeddings relatively larger than the additive encoding.
        x = x * math.sqrt(self.d_model)
        seq_len = x.size(1)
        # FIX: the original did `if x.is_cuda: pe.cuda()` and discarded the
        # result — a no-op that still crashed on device mismatch.  Buffers are
        # constant, so simply move the slice to the input's device.
        pe = self.pe[:, :seq_len].to(x.device)
        x = x + pe
        return self.dropout(x)


def get_clones(module, N):
    """Return a ModuleList of N independent deep copies of `module`."""
    return nn.ModuleList([copy.deepcopy(module) for _ in range(N)])


class Encoder(nn.Module):
    """Embedding + positional encoding + N encoder layers + final norm."""

    def __init__(self, vocab_size, d_model, N, heads, dropout):
        super().__init__()
        self.N = N
        self.embed = Embedder(vocab_size, d_model)
        self.pe = PositionalEncoder(d_model, dropout=dropout)
        self.layers = get_clones(EncoderLayer(d_model, heads, dropout), N)
        self.norm = Norm(d_model)

    def forward(self, src, mask):
        x = self.pe(self.embed(src))
        for layer in self.layers:
            x = layer(x, mask)
        return self.norm(x)


class Decoder(nn.Module):
    """Embedding + positional encoding + N decoder layers + final norm."""

    def __init__(self, vocab_size, d_model, N, heads, dropout):
        super().__init__()
        self.N = N
        self.embed = Embedder(vocab_size, d_model)
        self.pe = PositionalEncoder(d_model, dropout=dropout)
        self.layers = get_clones(DecoderLayer(d_model, heads, dropout), N)
        self.norm = Norm(d_model)

    def forward(self, trg, e_outputs, src_mask, trg_mask):
        x = self.pe(self.embed(trg))
        for layer in self.layers:
            x = layer(x, e_outputs, src_mask, trg_mask)
        return self.norm(x)


class Model(nn.Module):
    """Full seq2seq Transformer: Encoder + Decoder + output projection.

    `config` is a dict keyed by the SCMG config-constant strings plus
    "vocab_encoder"/"vocab_decoder" token->id mappings.
    """

    def __init__(self, config):
        super().__init__()
        # NOTE(review): DIM_ATTENTION / NUM_LAYERS / NUM_HEADS / RATE_DROPOUT
        # appear to be declared in SCMG.config.modelparameters, not in
        # varables — confirm `varables` re-exports them, otherwise these
        # lookups raise AttributeError at construction time.
        self.encoder = Encoder(len(config["vocab_encoder"]), config[varables.DIM_ATTENTION],
                               config[varables.NUM_LAYERS], config[varables.NUM_HEADS],
                               config[varables.RATE_DROPOUT])
        self.decoder = Decoder(len(config["vocab_decoder"]), config[varables.DIM_ATTENTION],
                               config[varables.NUM_LAYERS], config[varables.NUM_HEADS],
                               config[varables.RATE_DROPOUT])
        self.out = nn.Linear(config[varables.DIM_ATTENTION], len(config["vocab_decoder"]))
        # Set externally (or via init_optimizers) before init_scheduler is used.
        self.optimizer = None

    def get_block_size(self):
        # NOTE(review): self.block_size is never assigned in __init__ (the
        # assignment was commented out upstream), so this raises
        # AttributeError if called — confirm intended source of block size.
        return self.block_size

    def _init_weights(self, module):
        """GPT-style init; only effective if wired up via self.apply()."""
        if isinstance(module, (nn.Linear, nn.Embedding)):
            module.weight.data.normal_(mean=0.0, std=0.02)
            if isinstance(module, nn.Linear) and module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

    def init_optimizers(self, train_config):
        """Build an Adam optimizer from the training config (not stored)."""
        return torch.optim.Adam(self.parameters(), lr=train_config[varables.RATE_LEARNING])

    def init_scheduler(self, train_config):
        """Build a StepLR scheduler around self.optimizer (must be set first)."""
        return torch.optim.lr_scheduler.StepLR(self.optimizer,
                                               step_size=train_config[varables.SIZE_STEP],
                                               gamma=train_config[varables.GAMMA])

    def get_collate_fn(self, vocab_encoder, vocab_decoder):
        """Return a DataLoader collate fn that right-pads encoder/decoder ids."""
        def collate(results):
            x_in = [a[0] for a in results]
            y_in = [a[1] for a in results]
            boundary = -1  # unused placeholder kept for interface parity
            max_len_x = max(len(a) for a in x_in)
            max_len_y = max(len(a) for a in y_in)
            x = torch.tensor([a + [vocab_encoder[varables.TOKEN_PAD]] * (max_len_x - len(a)) for a in x_in],
                             dtype=torch.long)
            y = torch.tensor([a + [vocab_decoder[varables.TOKEN_PAD]] * (max_len_y - len(a)) for a in y_in],
                             dtype=torch.long)
            return x, y, boundary
        return collate

    def forward(self, src, trg, trg_out, boundary=None):
        """Return (logits, loss); loss is None when trg_out is None.

        NOTE(review): src_mask is always None (no padding mask on the
        encoder side) and cross_entropy has no ignore_index, so pad tokens
        contribute to the loss — confirm both are intended.
        """
        src_mask = None
        trg_mask = torch.tril(torch.ones(trg.shape[1], trg.shape[1])) \
            .view(1, 1, trg.shape[1], trg.shape[1]).to(trg.device)
        e_outputs = self.encoder(src, src_mask)
        d_output = self.decoder(trg, e_outputs, src_mask, trg_mask)
        logits = self.out(d_output)
        loss = None
        if trg_out is not None:
            loss = F.cross_entropy(logits.view(-1, logits.size(-1)), trg_out.view(-1))
        return logits, loss
# ============================================================================
# Reconstructed from diff-mangled source.  This span of the dump contains the
# bodies of several new files concatenated together; the section markers
# below preserve provenance.  Because the files are concatenated, later
# definitions (Model, DecoderBlock, sample) shadow earlier ones of the same
# name in this single-module reconstruction.
# ============================================================================

# --- SCMG/models/UTFMG2/model copy.py ---------------------------------------
import math
import logging

import torch
import torch.nn as nn
from torch.nn import functional as F

logger = logging.getLogger(__name__)
# NOTE(review): DIM_ATTENTION / NUM_HEADS / SIZE_BLOCK / etc. look like they
# are declared in SCMG.config.modelparameters — confirm `varables` exposes
# them, otherwise every config[varables.X] lookup raises AttributeError.
from SCMG.config import varables


class CausalSelfAttention(nn.Module):
    """Multi-head self-attention with a fixed lower-triangular (causal) mask."""

    def __init__(self, config):
        super().__init__()
        assert config[varables.DIM_ATTENTION] % config[varables.NUM_HEADS] == 0
        self.key = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION])
        self.query = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION])
        self.value = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION])
        self.dropout_attention = nn.Dropout(config[varables.RATE_DROPOUT])
        self.dropout_residue = nn.Dropout(config[varables.RATE_DROPOUT])
        self.projection = nn.Linear(config[varables.DIM_ATTENTION], config[varables.DIM_EMBEDDING])
        # (1, 1, block, block) lower-triangular mask buffer.
        self.register_buffer(
            "mask",
            torch.tril(torch.ones(config[varables.SIZE_BLOCK], config[varables.SIZE_BLOCK]))
            .view(1, 1, config[varables.SIZE_BLOCK], config[varables.SIZE_BLOCK]))
        self.n_head = config[varables.NUM_HEADS]
        self.single_head_dim = config[varables.DIM_ATTENTION] // self.n_head
        self.attention_features = config[varables.DIM_ATTENTION]

    def forward(self, x, layer_past=None):
        B, T, C = x.size()
        k = self.key(x).view(B, T, self.n_head, self.single_head_dim).transpose(1, 2)
        q = self.query(x).view(B, T, self.n_head, self.single_head_dim).transpose(1, 2)
        v = self.value(x).view(B, T, self.n_head, self.single_head_dim).transpose(1, 2)
        att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1)))
        att = att.masked_fill(self.mask[:, :, :T, :T] == 0, float('-inf'))
        att = F.softmax(att, dim=-1)
        att = self.dropout_attention(att)
        y = att @ v
        y = y.transpose(1, 2).contiguous().view(B, T, self.attention_features)
        return self.dropout_residue(self.projection(y))


class CrossAttention(nn.Module):
    """Attention where queries come from the decoder and keys/values from
    the encoder.

    NOTE(review): a lower-triangular mask is applied to the *cross* scores
    (decoder position t may only see the first t encoder positions).  That is
    unusual for standard cross-attention but may be deliberate for the
    single-stream UTFMG scheme — preserved as written; confirm.
    """

    def __init__(self, config):
        super().__init__()
        assert config[varables.DIM_ATTENTION] % config[varables.NUM_HEADS] == 0
        self.key = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION])
        self.query = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION])
        self.value = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION])
        self.dropout_attention = nn.Dropout(config[varables.RATE_DROPOUT])
        self.dropout_residue = nn.Dropout(config[varables.RATE_DROPOUT])
        self.projection = nn.Linear(config[varables.DIM_ATTENTION], config[varables.DIM_EMBEDDING])
        self.n_head = config[varables.NUM_HEADS]
        self.single_head_dim = config[varables.DIM_ATTENTION] // self.n_head
        self.attention_features = config[varables.DIM_ATTENTION]
        self.register_buffer(
            "mask",
            torch.tril(torch.ones(config[varables.SIZE_BLOCK], config[varables.SIZE_BLOCK]))
            .view(1, 1, config[varables.SIZE_BLOCK], config[varables.SIZE_BLOCK]))

    def forward(self, x_encoder, x_decoder, layer_past=None):
        B, T_encoder, _ = x_encoder.size()
        _, T_decoder, _ = x_decoder.size()  # batch sizes assumed equal
        k = self.key(x_encoder).view(B, T_encoder, self.n_head, self.single_head_dim).transpose(1, 2)
        q = self.query(x_decoder).view(B, T_decoder, self.n_head, self.single_head_dim).transpose(1, 2)
        v = self.value(x_encoder).view(B, T_encoder, self.n_head, self.single_head_dim).transpose(1, 2)
        att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1)))
        att = att.masked_fill(self.mask[:, :, :T_decoder, :T_encoder] == 0, float('-inf'))
        att = F.softmax(att, dim=-1)
        att = self.dropout_attention(att)
        y = att @ v
        y = y.transpose(1, 2).contiguous().view(B, T_decoder, self.attention_features)
        return self.dropout_residue(self.projection(y))


class EncoderBlock(nn.Module):
    """Pre-norm block: self-attention + MLP with residuals.

    NOTE(review): uses CausalSelfAttention, so even the *encoder* is causally
    masked — confirm this is intended for UTFMG.
    """

    def __init__(self, config):
        super().__init__()
        self.ln1 = nn.LayerNorm(config[varables.DIM_EMBEDDING])
        self.ln2 = nn.LayerNorm(config[varables.DIM_EMBEDDING])
        self.attn = CausalSelfAttention(config)
        self.mlp = nn.Sequential(
            nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_FEEDFORWARD]),
            nn.GELU(),
            nn.Linear(config[varables.DIM_FEEDFORWARD], config[varables.DIM_EMBEDDING]),
            nn.Dropout(config[varables.RATE_DROPOUT]),
        )

    def forward(self, x):
        x = x + self.attn(self.ln1(x))
        x = x + self.mlp(self.ln2(x))
        return x


class DecoderBlock(nn.Module):
    """Pre-norm decoder block: masked self-attn, cross-attn, MLP.

    NOTE(review): self.ln1 is reused for both the masked and the cross
    attention sub-layers (no separate norm) — preserved as written.
    """

    def __init__(self, config):
        super().__init__()
        self.ln1 = nn.LayerNorm(config[varables.DIM_EMBEDDING])
        self.ln2 = nn.LayerNorm(config[varables.DIM_EMBEDDING])
        self.masked_attn = CausalSelfAttention(config)
        self.cross_attn = CrossAttention(config)
        self.mlp = nn.Sequential(
            nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_FEEDFORWARD]),
            nn.GELU(),
            nn.Linear(config[varables.DIM_FEEDFORWARD], config[varables.DIM_EMBEDDING]),
            nn.Dropout(config[varables.RATE_DROPOUT]),
        )

    def forward(self, x_encoder, x):
        x = x + self.masked_attn(self.ln1(x))
        x = x + self.cross_attn(x_encoder, self.ln1(x))
        x = x + self.mlp(self.ln2(x))
        return x


class Model(nn.Module):
    """GPT-style encoder/decoder stack with shared token embedding."""

    def __init__(self, config):
        super().__init__()
        self.tok_emb = nn.Embedding(config[varables.SIZE_VOCAB], config[varables.DIM_EMBEDDING])
        # Learned positional embedding (zeros-initialised).
        self.pos_emb = nn.Parameter(torch.zeros(1, config[varables.SIZE_BLOCK], config[varables.DIM_EMBEDDING]))
        self.drop = nn.Dropout(config[varables.RATE_DROPOUT])
        self.encoder_blocks = nn.ModuleList([EncoderBlock(config) for _ in range(config[varables.NUM_LAYERS])])
        self.decoder_blocks = nn.ModuleList([DecoderBlock(config) for _ in range(config[varables.NUM_LAYERS])])
        self.ln_f = nn.LayerNorm(config[varables.DIM_EMBEDDING])
        self.head = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.SIZE_VOCAB], bias=False)
        self.block_size = config[varables.SIZE_BLOCK]
        self.apply(self._init_weights)
        logger.info("number of parameters: %e", sum(p.numel() for p in self.parameters()))
        self.optimizer = None

    def get_block_size(self):
        return self.block_size

    def _init_weights(self, module):
        """GPT-2 style init, applied to every submodule via self.apply()."""
        if isinstance(module, (nn.Linear, nn.Embedding)):
            module.weight.data.normal_(mean=0.0, std=0.02)
            if isinstance(module, nn.Linear) and module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

    def init_optimizers(self, train_config):
        """Build an Adam optimizer (caller stores it on self.optimizer)."""
        return torch.optim.Adam(self.parameters(), lr=train_config[varables.RATE_LEARNING])

    def init_scheduler(self, train_config):
        """Build a StepLR scheduler; requires self.optimizer to be set."""
        return torch.optim.lr_scheduler.StepLR(self.optimizer,
                                               step_size=train_config[varables.SIZE_STEP],
                                               gamma=train_config[varables.GAMMA])

    def get_collate_fn(self, vocab):
        """Return a collate fn that right-pads x/y id sequences with TOKEN_PAD."""
        def collate(results):
            x_in = [a[0] for a in results]
            y_in = [a[1] for a in results]
            boundary = -1  # placeholder kept for interface parity
            max_len_x = max(len(a) for a in x_in)
            max_len_y = max(len(a) for a in y_in)
            x = torch.tensor([a + [vocab[varables.TOKEN_PAD]] * (max_len_x - len(a)) for a in x_in],
                             dtype=torch.long)
            y = torch.tensor([a + [vocab[varables.TOKEN_PAD]] * (max_len_y - len(a)) for a in y_in],
                             dtype=torch.long)
            return x, y, boundary
        return collate

    def forward(self, x_in, y_in, y_out=None, boundary=None):
        """Encode x_in, decode y_in against it; return (logits, loss)."""
        x_in = self.drop(self.tok_emb(x_in) + self.pos_emb[:, :x_in.size()[1], :])
        y_in = self.drop(self.tok_emb(y_in) + self.pos_emb[:, :y_in.size()[1], :])
        for encoder_block in self.encoder_blocks:
            x_in = encoder_block(x_in)
        x_in = self.ln_f(x_in)
        for decoder_block in self.decoder_blocks:
            y_in = decoder_block(x_in, y_in)
        y_in = self.ln_f(y_in)
        logits = self.head(y_in)
        loss = None
        if y_out is not None:
            # NOTE(review): no ignore_index, so pad tokens count in the loss.
            loss = F.cross_entropy(logits.view(-1, logits.size(-1)), y_out.view(-1))
        return logits, loss


# --- SCMG/models/UTFMG2/model.py ---------------------------------------------
from torch.autograd import Variable  # imported upstream; unused here


class PositionalEncoder(nn.Module):
    """Fixed sinusoidal positional-encoding table (vectorised form)."""

    def __init__(self, config):
        super(PositionalEncoder, self).__init__()
        self.Dropout = nn.Dropout(p=config[varables.RATE_DROPOUT])
        max_len = config[varables.SIZE_BLOCK]
        pe = torch.zeros(max_len, config[varables.DIM_ATTENTION])
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, config[varables.DIM_ATTENTION], 2).float()
                             * (-math.log(10000.0) / config[varables.DIM_ATTENTION]))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0)  # (1, max_len, dim) to broadcast over batch
        self.register_buffer('pe', pe)

    def forward(self, T):
        # NOTE(review): dropout is applied to the encoding table itself,
        # not to (embedding + encoding) — confirm intended.
        return self.Dropout(self.pe[:, :T, :])


class Attention(nn.Module):
    """Unified self/cross multi-head attention: K/V from X_1, Q from X_2."""

    def __init__(self, config):
        super().__init__()
        assert config[varables.DIM_ATTENTION] % config[varables.NUM_HEADS] == 0
        self.Key = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION])
        self.Query = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION])
        self.Value = nn.Linear(config[varables.DIM_EMBEDDING], config[varables.DIM_ATTENTION])
        self.Dropout_Attention = nn.Dropout(config[varables.RATE_DROPOUT])
        self.Dropout_Residue = nn.Dropout(config[varables.RATE_DROPOUT])
        self.Projection = nn.Linear(config[varables.DIM_ATTENTION], config[varables.DIM_EMBEDDING])
        self.NumberOfHeads = config[varables.NUM_HEADS]
        self.DimHead = config[varables.DIM_ATTENTION] // self.NumberOfHeads
        self.DimAttention = config[varables.DIM_ATTENTION]

    def forward(self, X_1, X_2, mask=None):
        # Self-attention when X_2 is None.
        if X_2 is None:
            X_2 = X_1
        BatchSize, T_Encoder, _ = X_1.size()
        BatchSize, T_Decoder, _ = X_2.size()
        K = self.Key(X_1).view(BatchSize, T_Encoder, self.NumberOfHeads, self.DimHead).transpose(1, 2)
        Q = self.Query(X_2).view(BatchSize, T_Decoder, self.NumberOfHeads, self.DimHead).transpose(1, 2)
        V = self.Value(X_1).view(BatchSize, T_Encoder, self.NumberOfHeads, self.DimHead).transpose(1, 2)
        ScoreAttention = (Q @ K.transpose(-2, -1)) / math.sqrt(self.DimHead)
        # FIX: the original unconditionally evaluated `mask == 0`, which
        # raises TypeError when mask is None (the declared default).
        if mask is not None:
            ScoreAttention = ScoreAttention.masked_fill(mask == 0, -1e9)
        ScoreAttention = F.softmax(ScoreAttention, dim=-1)
        ScoreAttention = self.Dropout_Attention(ScoreAttention)
        Z = ScoreAttention @ V
        Z = Z.transpose(1, 2).contiguous().view(BatchSize, T_Decoder, self.DimAttention)
        return self.Dropout_Residue(self.Projection(Z))


class FeedForward(nn.Module):
    """MLP sub-layer; DIM_FEEDFORWARD == 0 means "use 4 * DIM_ATTENTION"."""

    def __init__(self, config):
        super().__init__()
        if config[varables.DIM_FEEDFORWARD] == 0:
            Dim_FeedForward = config[varables.DIM_ATTENTION] * 4
        else:
            Dim_FeedForward = config[varables.DIM_FEEDFORWARD]
        self.Linear1 = nn.Linear(config[varables.DIM_EMBEDDING], Dim_FeedForward)
        self.GELU = nn.GELU()
        self.Linear2 = nn.Linear(Dim_FeedForward, config[varables.DIM_EMBEDDING])
        self.Dropout = nn.Dropout(config[varables.RATE_DROPOUT])

    def forward(self, x):
        x = self.Linear1(x)
        x = self.GELU(x)
        x = self.Dropout(x)
        return self.Linear2(x)


class DecoderBlock(nn.Module):
    """Pre-norm decoder block (shadows the 'model copy' DecoderBlock above).

    NOTE(review): dropout is applied *around* the residual sum
    (Dropout(x + sublayer(x))), which also drops the skip path — unusual;
    preserved as written.
    """

    def __init__(self, config):
        super().__init__()
        self.LayerNorm1 = nn.LayerNorm(config[varables.DIM_EMBEDDING])
        self.LayerNorm2 = nn.LayerNorm(config[varables.DIM_EMBEDDING])
        self.LayerNorm3 = nn.LayerNorm(config[varables.DIM_EMBEDDING])
        self.Dropout1 = nn.Dropout(config[varables.RATE_DROPOUT])
        self.Dropout2 = nn.Dropout(config[varables.RATE_DROPOUT])
        self.Dropout3 = nn.Dropout(config[varables.RATE_DROPOUT])
        self.AttentionMasked = Attention(config)
        self.AttentionCross = Attention(config)
        self.FeedForward = FeedForward(config)

    def forward(self, X_Encoder, X_Decoder, Mask_Cross, Mask_Decoder):
        X_Decoder = self.Dropout1(X_Decoder + self.AttentionMasked(self.LayerNorm1(X_Decoder), None, Mask_Decoder))
        X_Decoder = self.Dropout2(X_Decoder + self.AttentionCross(X_Encoder, self.LayerNorm2(X_Decoder), Mask_Cross))
        X_Decoder = self.Dropout3(X_Decoder + self.FeedForward(self.LayerNorm3(X_Decoder)))
        return X_Decoder


class Model(nn.Module):
    """UTFMG2 encoder/decoder model (shadows the 'model copy' Model above).

    NOTE(review): this class is clearly work-in-progress — see the notes in
    generate_masks() and forward().  Behaviour is preserved as written.
    """

    def __init__(self, config):
        super().__init__()
        self.Dim_Attention = config[varables.DIM_ATTENTION]
        self.Token_Padding_Encoder = config["Token_Padding_Encoder"]
        self.Token_Padding_Decoder = config["Token_Padding_Decoder"]
        # Embedding and positional encoding layers.
        self.Embedding_Encoder = nn.Embedding(len(config["vocab_encoder"]), config[varables.DIM_ATTENTION])
        self.Embedding_Decoder = nn.Embedding(len(config["vocab_decoder"]), config[varables.DIM_ATTENTION])
        self.pos_emb = PositionalEncoder(config)
        # Dropout and normalisation layers.
        self.Dropout1 = nn.Dropout(config[varables.RATE_DROPOUT])
        self.Dropout2 = nn.Dropout(config[varables.RATE_DROPOUT])
        self.LayerNorm1 = nn.LayerNorm(config[varables.DIM_EMBEDDING])
        self.LayerNorm2 = nn.LayerNorm(config[varables.DIM_EMBEDDING])
        # Transformer layers.  NOTE(review): EncoderBlock is defined in
        # "model copy.py", not in this file — in the real package this is a
        # NameError unless it is imported; it only resolves here because the
        # files are concatenated in this reconstruction.
        self.encoder_blocks = nn.ModuleList([EncoderBlock(config) for _ in range(config[varables.NUM_LAYERS])])
        self.decoder_blocks = nn.ModuleList([DecoderBlock(config) for _ in range(config[varables.NUM_LAYERS])])
        # Output layer.
        self.head = nn.Linear(config[varables.DIM_ATTENTION], len(config["vocab_decoder"]), bias=False)
        self.apply(self._init_weights)
        self.optimizer = None

    def _init_weights(self, module):
        # NOTE(review): Module.parameters() is recursive, and self.apply()
        # visits every submodule, so nested parameters are re-initialised
        # once per enclosing module — harmless (same init) but wasteful.
        for p in module.parameters():
            if p.dim() > 1:
                nn.init.xavier_uniform_(p)

    def init_optimizers(self, train_config):
        """Build an Adam optimizer (caller stores it on self.optimizer)."""
        return torch.optim.Adam(self.parameters(), lr=train_config[varables.RATE_LEARNING])

    def init_scheduler(self, train_config):
        """Build a StepLR scheduler; requires self.optimizer to be set."""
        return torch.optim.lr_scheduler.StepLR(self.optimizer,
                                               step_size=train_config[varables.SIZE_STEP],
                                               gamma=train_config[varables.GAMMA])

    def get_collate_fn(self, vocab_encoder, vocab_decoder):
        """Return a collate fn that right-pads encoder/decoder id sequences."""
        def collate(results):
            X_Encoder = [a[0] for a in results]
            X_Decoder = [a[1] for a in results]
            boundary = -1
            max_len_x = max(len(a) for a in X_Encoder)
            max_len_y = max(len(a) for a in X_Decoder)
            x = torch.tensor([a + [vocab_encoder[varables.TOKEN_PAD]] * (max_len_x - len(a)) for a in X_Encoder],
                             dtype=torch.long)
            y = torch.tensor([a + [vocab_decoder[varables.TOKEN_PAD]] * (max_len_y - len(a)) for a in X_Decoder],
                             dtype=torch.long)
            return x, y, boundary
        return collate

    def generate_masks(self, X_Encoder, X_Decoder):
        """Build encoder / decoder / cross attention masks.

        NOTE(review): work-in-progress, preserved as written:
          * three dims are unpacked from inputs that forward() passes as 2-D
            (batch, seq) token-id tensors;
          * self.Token_Sep_Encoder is never assigned in __init__ (only the
            Token_Padding_* attributes are);
          * T is used but never defined (its assignment is commented out);
          * forward() unpacks the return as (Mask_Decoder, Mask_UTFMG,
            CutIndex), which does not match the values returned here.
        """
        BatchSize, T_Encoder, _ = X_Encoder.size()
        BatchSize, T_Decoder, _ = X_Decoder.size()
        X = torch.cat([X_Encoder,
                       torch.tensor([self.Token_Sep_Encoder], device=X_Encoder.device).unsqueeze(0).repeat(BatchSize, 1),
                       X_Decoder], axis=1)
        CutIndex = T_Encoder + 1
        # T = X_Decoder.shape[1]
        Mask_Encoder = (X_Encoder != self.Token_Padding_Encoder).unsqueeze(-2).unsqueeze(-2)
        Mask_Decoder = (X_Decoder != self.Token_Padding_Decoder).unsqueeze(-2).unsqueeze(-2).repeat(1, 1, T, 1)
        Mask_Cross = (X_Encoder != self.Token_Padding_Encoder).unsqueeze(-2).unsqueeze(-2)
        mask_tril = torch.tril(torch.ones(T, T)).view(1, 1, T, T).to(Mask_Decoder.device)
        Mask_Decoder = Mask_Decoder.masked_fill(mask_tril == 0, 0)
        return Mask_Encoder, Mask_Decoder, Mask_Cross

    def forward(self, X_Encoder, X_Decoder, Y_Decoder_Ref=None, boundary=None):
        """Return (logits, loss); loss is None when Y_Decoder_Ref is None.

        NOTE(review): WIP — X_Encoder is never embedded before being fed to
        the decoder blocks, and decoder_block is called with 3 arguments while
        DecoderBlock.forward takes 4.  Preserved as written.
        """
        Mask_Decoder, Mask_UTFMG, CutIndex = self.generate_masks(X_Encoder, X_Decoder)
        X_Decoder = self.Dropout2(self.Embedding_Decoder(X_Decoder) * math.sqrt(self.Dim_Attention)
                                  + self.pos_emb(X_Decoder.size(1)))
        for decoder_block in self.decoder_blocks:
            X_Decoder = decoder_block(X_Encoder, X_Decoder, Mask_UTFMG)
        X_Decoder = self.LayerNorm2(X_Decoder)
        Y_Decoder_Logits = self.head(X_Decoder[:, CutIndex:])
        loss = None
        if Y_Decoder_Ref is not None:
            loss = F.cross_entropy(Y_Decoder_Logits.view(-1, Y_Decoder_Logits.size(-1)),
                                   Y_Decoder_Ref.view(-1),
                                   ignore_index=self.Token_Padding_Decoder)
        return Y_Decoder_Logits, loss


# --- SCMG/models/UTFMG2/sampler.py -------------------------------------------
import random

import numpy as np


def set_seed(seed):
    """Seed python, numpy and torch (CPU + all CUDA devices)."""
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)


def top_k_logits(logits, k):
    """Return a copy of `logits` with everything below the k-th value -inf."""
    v, ix = torch.topk(logits, k)
    out = logits.clone()
    out[out < v[:, [-1]]] = -float('Inf')
    return out


# NOTE(review): this first `sample` is immediately shadowed by the second
# definition below and is therefore dead code — keep or rename deliberately.
@torch.no_grad()
def sample(model, x, steps, temperature=1.0, sample=False, top_k=None):
    """Autoregressive sampling with optional top-k / greedy decoding."""
    block_size = model.get_block_size()
    model.eval()
    for k in range(steps):
        x_cond = x if x.size(1) <= block_size else x[:, -block_size:]
        logits, _ = model(x_cond)
        logits = logits[:, -1, :] / temperature
        if top_k is not None:
            logits = top_k_logits(logits, top_k)
        probs = F.softmax(logits, dim=-1)
        if sample:
            ix = torch.multinomial(probs, num_samples=1)
        else:
            _, ix = torch.topk(probs, k=1, dim=-1)
        x = torch.cat((x, ix), dim=1)
    return x


@torch.no_grad()
def sample(model, x, steps, temperature=1.0, boundary=None):
    """Autoregressive multinomial sampling with a boundary-aware model call."""
    block_size = model.get_block_size()
    model.eval()
    for k in range(steps):
        x_cond = x if x.size(1) <= block_size else x[:, -block_size:]
        logits, _ = model(x_cond, boundary=boundary)
        logits = logits[:, -1, :] / temperature
        probs = F.softmax(logits, dim=-1)
        ix = torch.multinomial(probs, num_samples=1)
        x = torch.cat((x, ix), dim=1)
    return x


# No-op leftover expression from interactive experimentation (preserved).
'L_5*C(=O)NCc1cccc(OC)c1.*c1nsc2ccccc12COc1cccc(CNC(=O)c2cccc(NC(=O)c3nsc4ccccc34)c2)c1'


# for i in range(1,21):
def sample_L(i, option='string'):
    """Scratch driver for sampling from an 'L_<i>' scaffold prompt.

    NOTE(review): references module-level names that are not defined in this
    file (`vocab`, `model`, `inv`, `test_valid`) and hard-codes 'cuda' —
    clearly interactive scratch code; preserved as written.
    """
    prefix = 'L_' + str(i)
    string_input = prefix + '*O=C1NN=Cc2c1cccc2.*O=C(C1CC1)N1CCNCC1'
    array_input = [vocab[a] for a in [''] + list(string_input)]
    boundary = [len(array_input)]
    tensor_input = torch.tensor(array_input, device='cuda').unsqueeze(0).repeat(32, 1)
    boundary = boundary * 32
    tensor_output = sample(model, tensor_input, 250, boundary=boundary)
    strings_output = []
    for j in range(tensor_output.shape[0]):
        list_string_output = [inv[a] for a in tensor_output[j, boundary[j]:].cpu().numpy() if a != vocab['']]
        # if list_string_output[0] == '':
        #     list_string_output = list_string_output[1:]
        if list_string_output[-1] == '':
            list_string_output = list_string_output[:-1]
        string_output = ''.join(list_string_output)
        strings_output.append(string_output)
        print(string_output)
    for j in range(tensor_output.shape[0]):
        if test_valid(strings_output[j]):
            print(1)
        else:
            print(0)

    # logits,_ = model(tensor_input,boundary=boundary)


# No-op leftover token list from interactive experimentation (preserved).
['', 'L', '_', '5', '*', 'C', '(', '=', 'O', ')', 'N', 'C', 'c', '1', 'c', 'c', 'c', 'c', '(', 'O', 'C', ')', 'c', '1', '.', '*', 'c', '1', 'n', 's', 'c', '2', 'c', 'c', 'c', 'c', 'c', '1', '2', 'C', 'O', 'c', '1', 'c', 'c', 'c', 'c', '(', 'C', 'N', 'C', '(', '=', 'O', ')', 'c', '2', 'c', 'c', 'c', 'c', '(', 'N', 'C', '(', '=', 'O', ')', 'c', '3', 'n', 's', 'c', '4', 'c', 'c', 'c', 'c', 'c', '3', '4', ')', 'c', '2', ')', 'c', '1', '']


# --- SCMG/models/__init__.py (also present in this span of the dump) ---------
# from .Transformer import *