Update README.md
README.md CHANGED
@@ -10,12 +10,22 @@ In this work, we introduce **FusOn-pLM**, a novel pLM that fine-tunes the state-
 
 ```
 from transformers import AutoTokenizer, AutoModel
+import logging
 import torch
 
+# Suppress warnings about newly initialized 'esm.pooler.dense.bias', 'esm.pooler.dense.weight' layers - these are not used to extract embeddings
+logging.getLogger("transformers.modeling_utils").setLevel(logging.ERROR)
+
+# Set device
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+print(f"Using device: {device}")
+
 # Load the tokenizer and model
 model_name = "ChatterjeeLab/FusOn-pLM"
 tokenizer = AutoTokenizer.from_pretrained(model_name)
 model = AutoModel.from_pretrained(model_name)
+model.to(device)
+model.eval()
 
 # Example fusion oncoprotein sequence: MLLT10:PICALM, associated with Acute Myeloid Leukemia (LAML)
 # Amino acids 1-80 are derived from the head gene, MLLT10
@@ -23,16 +33,21 @@
 sequence = "MVSSDRPVSLEDEVSHSMKEMIGGCCVCSDERGWAENPLVYCDGHGCSVAVHQACYGIVQVPTGPWFCRKCESQERAARVPPQMGSVPVMTQPTLIYSQPVMRPPNPFGPVSGAQIQFM"
 
 # Tokenize the input sequence
-inputs = tokenizer(sequence, return_tensors="pt")
+inputs = tokenizer(sequence, return_tensors="pt", padding=True, truncation=True, max_length=2000)
+inputs = {k: v.to(device) for k, v in inputs.items()}
 
 # Get the embeddings
 with torch.no_grad():
     outputs = model(**inputs)
 # The embeddings are in the last_hidden_state tensor
 embeddings = outputs.last_hidden_state
+# remove extra dimension
+embeddings = embeddings.squeeze(0)
+# remove BOS and EOS tokens
+embeddings = embeddings[1:-1, :]
 
 # Convert embeddings to numpy array (if needed)
-embeddings = embeddings.numpy()
+embeddings = embeddings.cpu().numpy()
 
 print("Per-residue embeddings shape:", embeddings.shape)
 
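The updated snippet stops at the per-residue matrix. Many downstream tasks (classification, clustering, similarity search) want one fixed-size vector per protein instead; a common convention is to mean-pool over the residue axis. The sketch below is not part of the commit: it assumes `embeddings` is the `(sequence_length, hidden_dim)` NumPy array produced at the end of the snippet above.

```python
import numpy as np

# Minimal sketch (not from the README): collapse the per-residue matrix
# into one fixed-size vector by averaging over the residue dimension.
# Assumes `embeddings` is the (seq_len, hidden_dim) NumPy array from above.
sequence_embedding = embeddings.mean(axis=0)

print("Sequence-level embedding shape:", sequence_embedding.shape)  # (hidden_dim,)
```

Mean pooling is only one choice; max pooling or using a single special-token embedding are alternatives, and which works best depends on the downstream task.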
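The snippet also embeds one sequence at a time. For a large set of fusion sequences, batching is usually faster. The following is a hedged sketch, not the repository's documented API: it assumes the `tokenizer`, `model`, and `device` objects from the snippet above, an ESM-style tokenizer that adds BOS and EOS tokens (which the README trims with `[1:-1]`), right-padding, and an `attention_mask` in the tokenizer output; the short second sequence is a made-up toy peptide for illustration only.

```python
# Sketch only: batched embedding extraction, reusing `tokenizer`, `model`,
# and `device` from the snippet above.
sequences = [
    "MVSSDRPVSLEDEVSHSMKEMIGGCCVCSDERGWAENPLVYCDGHGCSVAVHQACYGIVQVPTGPWFCRKCESQERAARVPPQMGSVPVMTQPTLIYSQPVMRPPNPFGPVSGAQIQFM",
    "MKTAYIAKQRQISFVKSHFSRQLEERLGLIEVQ",  # toy peptide, not a real fusion oncoprotein
]

batch = tokenizer(sequences, return_tensors="pt", padding=True, truncation=True, max_length=2000)
batch = {k: v.to(device) for k, v in batch.items()}

with torch.no_grad():
    out = model(**batch)

per_residue = []
for i in range(len(sequences)):
    # Number of non-padding tokens for sequence i, including BOS and EOS
    n_tokens = batch["attention_mask"][i].sum().item()
    # Drop BOS (index 0) and EOS (index n_tokens - 1), keeping only real residues
    per_residue.append(out.last_hidden_state[i, 1 : n_tokens - 1, :].cpu().numpy())

for seq, emb in zip(sequences, per_residue):
    print(len(seq), emb.shape)  # emb.shape should be (len(seq), hidden_dim)
```

Trimming with the attention mask rather than a fixed `[1:-1]` slice matters in the batched case, because shorter sequences are padded and a fixed slice would keep padding positions.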