Create gsfm.py
gsfm.py (new file, 136 lines)
import pathlib
import tempfile
from collections import Counter

import torch
import lightning as L
from huggingface_hub import PyTorchModelHubMixin, HfApi, hf_hub_download

# Special-token indices; special_symbols must keep this order.
UNK_IDX, PAD_IDX = 0, 1
special_symbols = ['<unk>', '<pad>']

class Vocab:
    """Minimal vocabulary: maps tokens to integer indices and back."""

    def __init__(self, vocab, default_index=UNK_IDX):
        self.vocab = vocab
        self.default_index = default_index
        self.lookup = {token: i for i, token in enumerate(vocab)}

    def __call__(self, sentence):
        # Encode a token sequence; unknown tokens map to default_index.
        return [self.lookup.get(token, self.default_index) for token in sentence]

    @staticmethod
    def build_vocab_from_iterator(it, min_freq=1, specials=(), special_first=True):
        vocab = []
        if special_first:
            vocab += specials
        tokens = Counter()
        for sentence in it:
            tokens.update(sentence)
        for token, freq in tokens.most_common():
            if freq < min_freq:
                continue
            vocab.append(token)
        if not special_first:
            vocab += specials
        return Vocab(vocab)

    def set_default_index(self, default_index):
        self.default_index = default_index

    def __len__(self):
        return len(self.vocab)

    def __reduce__(self):
        # Support pickling (e.g. inside checkpoints), keeping default_index.
        return (Vocab, (self.vocab, self.default_index))

    def save_txt(self, filename):
        # One token per line, in index order.
        with open(filename, 'w') as fw:
            for token in self.vocab:
                print(token, file=fw)

    @staticmethod
    def from_txt(filename):
        with open(filename, 'r') as fr:
            return Vocab([line for line in map(str.rstrip, fr) if line])

    @staticmethod
    def from_pretrained(repo_id: str, path_in_repo='vocab.txt'):
        vocab_txt = hf_hub_download(repo_id=repo_id, filename=path_in_repo)
        return Vocab.from_txt(vocab_txt)

    def push_to_hub(self, repo_id: str, path_in_repo='vocab.txt'):
        api = HfApi()
        api.create_repo(repo_id, exist_ok=True)
        with tempfile.TemporaryDirectory() as tmpdir:
            tmpdir = pathlib.Path(tmpdir)
            self.save_txt(tmpdir / 'vocab.txt')
            return api.upload_file(
                path_or_fileobj=tmpdir / 'vocab.txt',
                repo_id=repo_id,
                path_in_repo=path_in_repo,
            )

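# A quick usage sketch (tokens and repo id are illustrative placeholders):
#
#   corpus = [['TP53', 'BRCA1'], ['TP53', 'EGFR']]
#   vocab = Vocab.build_vocab_from_iterator(corpus, specials=special_symbols)
#   vocab(['TP53', 'UNSEEN'])       # -> [2, 0]: unknowns fall back to UNK_IDX
#   vocab.push_to_hub('user/gsfm')  # later: Vocab.from_pretrained('user/gsfm')
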
class MLP(torch.nn.Module):
    """Plain feed-forward stack: Linear -> activation -> dropout, repeated."""

    def __init__(self, *dims, activation=torch.nn.ReLU, dropout=0.2):
        super().__init__()
        activation = activation()
        dropout = torch.nn.Dropout(dropout)
        self.layers = torch.nn.ModuleList([
            layer
            for a, b in zip(dims, dims[1:])
            for layer in (
                torch.nn.Linear(a, b),
                activation,
                dropout,
            )
        ][:-2])  # the last layer doesn't need activation/dropout

    def forward(self, x):
        for layer in self.layers:
            x = layer(x)
        return x

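# For example, MLP(256, 512, 10) yields Linear(256, 512) -> ReLU -> Dropout ->
# Linear(512, 10); the [:-2] slice drops the activation/dropout that would
# otherwise follow the final Linear.
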
class GSFM(
    L.LightningModule,
    PyTorchModelHubMixin,
    tags=["gene", "gene set", "bioinformatics"],
):
    def __init__(self, vocab_size, d_model=256, depth=2):
        super().__init__()
        self.vocab_size = vocab_size
        self.d_model = d_model
        self.depth = depth
        self.embedding = torch.nn.Embedding(vocab_size, d_model, padding_idx=PAD_IDX)
        # With the default depth=2 these collapse to MLP(d_model, d_model) and
        # MLP(d_model*2, vocab_size); larger depths add d_model**n hidden layers.
        self.encoder = MLP(*[d_model**n for n in range(1, depth)], d_model)
        self.decoder = MLP(d_model*2, *[d_model**n for n in range(2, depth)], vocab_size)
        self.save_hyperparameters()

    def encode(self, x):
        # x: (batch, set_size) token indices.
        emb = self.embedding(x)
        enc = self.encoder(emb)
        # Mean-pool over the set dimension, both before and after the encoder.
        x = torch.cat([enc.mean(1), emb.mean(1)], -1)
        return x

    def forward(self, x):
        x = self.encode(x)
        x = self.decoder(x)  # logits over the full vocabulary
        return x

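    # Shape sketch with d_model=256: a (B, S) batch embeds to (B, S, 256),
    # encodes to (B, S, 256), pools to (B, 512), and decodes to (B, vocab_size)
    # multi-label membership logits.
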
    def training_step(self, batch, batch_idx):
        x, y = batch
        # NaN targets mark entries to exclude from the loss (e.g. the input
        # genes themselves); zero them and mask them with a per-element weight.
        missing = torch.isnan(y)
        y = torch.where(missing, 0.0, y)
        # `weight=` (rather than pos_weight) masks both the positive and the
        # negative BCE terms at missing positions.
        criterion = torch.nn.BCEWithLogitsLoss(weight=(~missing).float())
        y_ = self(x)
        loss = criterion(y_, y)
        self.log('loss', loss, prog_bar=True)
        return loss

    def validation_step(self, batch, batch_idx):
        return self.training_step(batch, batch_idx)

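    # E.g. y = [1., nan, 0.] gives weight [1., 0., 1.]: the middle position
    # contributes nothing to the loss whatever the model predicts there.
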
    def configure_optimizers(self):
        optimizer = torch.optim.Adam(self.parameters())
        # Quarter the learning rate whenever the monitored loss plateaus.
        scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min', factor=0.25)
        return [optimizer], [{
            "scheduler": scheduler,
            "monitor": "loss",
            "frequency": 1,
        }]
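
# A minimal smoke test, guarded so the module stays importable; the gene
# symbols below are illustrative placeholders, not the model's training data.
if __name__ == '__main__':
    gene_sets = [['TP53', 'BRCA1'], ['TP53', 'EGFR', 'MYC']]
    vocab = Vocab.build_vocab_from_iterator(gene_sets, specials=special_symbols)
    model = GSFM(vocab_size=len(vocab))
    batch = torch.tensor([vocab(['TP53', 'BRCA1', '<pad>']),
                          vocab(['TP53', 'EGFR', 'MYC'])])
    print(model(batch).shape)  # torch.Size([2, 6]) == (batch, vocab_size)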