Update gsfm.py
gsfm.py CHANGED
@@ -7,6 +7,10 @@ from huggingface_hub import PyTorchModelHubMixin, HfApi, hf_hub_download
 UNK_IDX, PAD_IDX = 0, 1
 special_symbols = ['<unk>', '<pad>']
 
+def multihot_tensor(indices: torch.Tensor, num_classes: int, dtype=torch.int64, device=None):
+    *bs, _ = indices.shape
+    return torch.zeros((*bs, num_classes,), device=device, dtype=dtype).scatter(1, indices, 1)
+
 class Vocab:
     def __init__(self, vocab, default_index=0):
         self.vocab = vocab
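The new multihot_tensor helper turns a batch of token-index tensors into multi-hot vectors with a single scatter call. A minimal usage sketch, assuming only torch; the tensor values are illustrative:

import torch

# Two "gene sets", each given as 3 token indices; duplicate indices and
# the <pad> index (1) collapse into single 1s in the output.
x = torch.tensor([[2, 4, 1],
                  [3, 3, 1]])
print(multihot_tensor(x, num_classes=6))
# tensor([[0, 1, 1, 0, 1, 0],
#         [0, 1, 0, 1, 0, 0]])

Note that scatter(1, indices, 1) hardcodes the class dimension, so despite the *bs unpacking the helper effectively assumes 2-D (batch, set_size) input; encode and training_step below only call it that way.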
@@ -91,21 +95,22 @@ class GSFM(
     PyTorchModelHubMixin,
     tags=["gene", "gene set", "bioinformatics"],
 ):
-    def __init__(self, vocab_size, d_model=256, depth=2):
+    def __init__(self, vocab_size, d_model=256, depth=2, dropout=0.2, partition=0, weighted_loss=None):
         super().__init__()
         self.vocab_size = vocab_size
         self.d_model = d_model
         self.depth = depth
-        self.…
-        self.…
-        self.…
+        self.dropout = dropout
+        self.partition = partition
+        self.weighted_loss = weighted_loss
+        self.encoder = MLP(vocab_size, *[d_model*(2**(n-1)) for n in range(depth, 1, -1)], d_model, dropout=dropout)
+        self.decoder = MLP(d_model, *[d_model*(2**(n-1)) for n in range(1, depth)], vocab_size, dropout=dropout)
         self.save_hyperparameters()
 
     def encode(self, x):
-        x = …
-        x…
-        …
-        return x
+        x = multihot_tensor(x, num_classes=self.vocab_size, device=self.device, dtype=torch.float)
+        x[:, PAD_IDX] = 0
+        return self.encoder(x)
 
     def forward(self, x):
         x = self.encode(x)
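The updated __init__ adds dropout, partition, and weighted_loss hyperparameters and builds an explicit MLP encoder/decoder pair whose hidden widths are powers of two times d_model. With the defaults (d_model=256, depth=2) the widths work out to encoder vocab_size → 512 → 256 and decoder 256 → 256 → vocab_size; the two width formulas are offset by one power of two rather than being exact mirrors. The MLP class itself is outside this diff; a minimal sketch of an implementation compatible with the call signature (variadic widths plus a dropout keyword) could look like:

import torch.nn as nn

class MLP(nn.Sequential):
    # Hypothetical stand-in: the real MLP in gsfm.py is not shown in this diff.
    def __init__(self, *widths, dropout=0.0):
        layers = []
        for i, (n_in, n_out) in enumerate(zip(widths[:-1], widths[1:])):
            layers.append(nn.Linear(n_in, n_out))
            if i < len(widths) - 2:  # hidden layers only: no activation/dropout on the output
                layers.append(nn.ReLU())
                layers.append(nn.Dropout(dropout))
        super().__init__(*layers)

The stored partition and weighted_loss hyperparameters are not referenced in the hunks shown here.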
@@ -113,12 +118,11 @@ class GSFM(
         return x
 
     def training_step(self, batch, batch_idx):
-        …
-        …
-        y = …
-        …
-        …
-        criterion = torch.nn.BCEWithLogitsLoss(pos_weight=pos_weight)
+        x_idx = y_idx = batch
+        y_ = self(x_idx)
+        y = multihot_tensor(y_idx, num_classes=self.vocab_size, device=self.device, dtype=torch.float)
+        y[:, PAD_IDX] = 0
+        criterion = torch.nn.BCEWithLogitsLoss()
         loss = criterion(y_, y)
         self.log('loss', loss, prog_bar=True)
         return loss
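The rewritten training_step trains the model as a set autoencoder: the batch of index tensors serves as both input and target (x_idx = y_idx = batch), the target is the same multi-hot encoding used in encode with the <pad> column zeroed so padding carries no gradient signal, and the previous pos_weight argument to BCEWithLogitsLoss is dropped in favor of an unweighted loss. A hedged end-to-end sketch, assuming the surrounding file defines GSFM as a LightningModule and that forward returns vocab-sized logits; the batch shape is an illustrative assumption:

import torch

vocab_size = 1000
model = GSFM(vocab_size=vocab_size)

# A toy batch of 8 padded gene sets, 32 token indices each (0=<unk>, 1=<pad>).
batch = torch.randint(2, vocab_size, (8, 32))

z = model.encode(batch)   # (8, 256) gene-set embeddings
logits = model(batch)     # (8, 1000) reconstruction logits

# The same loss training_step computes:
y = multihot_tensor(batch, num_classes=vocab_size, dtype=torch.float)
y[:, PAD_IDX] = 0
loss = torch.nn.functional.binary_cross_entropy_with_logits(logits, y)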