Update README.md
README.md CHANGED
@@ -10,12 +10,22 @@ In this work, we introduce **FusOn-pLM**, a novel pLM that fine-tunes the state-
 
 ```
 from transformers import AutoTokenizer, AutoModel
+import logging
 import torch
 
+# Suppress warnings about newly initialized 'esm.pooler.dense.bias', 'esm.pooler.dense.weight' layers - these are not used to extract embeddings
+logging.getLogger("transformers.modeling_utils").setLevel(logging.ERROR)
+
+# Set device
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+print(f"Using device: {device}")
+
 # Load the tokenizer and model
 model_name = "ChatterjeeLab/FusOn-pLM"
 tokenizer = AutoTokenizer.from_pretrained(model_name)
 model = AutoModel.from_pretrained(model_name)
+model.to(device)
+model.eval()
 
 # Example fusion oncoprotein sequence: MLLT10:PICALM, associated with Acute Myeloid Leukemia (LAML)
 # Amino acids 1-80 are derived from the head gene, MLLT10
@@ -23,16 +33,21 @@
 sequence = "MVSSDRPVSLEDEVSHSMKEMIGGCCVCSDERGWAENPLVYCDGHGCSVAVHQACYGIVQVPTGPWFCRKCESQERAARVPPQMGSVPVMTQPTLIYSQPVMRPPNPFGPVSGAQIQFM"
 
 # Tokenize the input sequence
-inputs = tokenizer(sequence, return_tensors="pt")
+inputs = tokenizer(sequence, return_tensors="pt", padding=True, truncation=True, max_length=2000)
+inputs = {k: v.to(device) for k, v in inputs.items()}
 
 # Get the embeddings
 with torch.no_grad():
     outputs = model(**inputs)
 # The embeddings are in the last_hidden_state tensor
 embeddings = outputs.last_hidden_state
+# remove extra dimension
+embeddings = embeddings.squeeze(0)
+# remove BOS and EOS tokens
+embeddings = embeddings[1:-1, :]
 
 # Convert embeddings to numpy array (if needed)
-embeddings = embeddings.numpy()
+embeddings = embeddings.cpu().numpy()
 
 print("Per-residue embeddings shape:", embeddings.shape)
 
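The updated snippet stops at the per-residue matrix. Many downstream tasks (classification, clustering, similarity search) want one fixed-size vector per protein instead; a common convention is to mean-pool over the residue axis. The sketch below is not part of the commit: it assumes `embeddings` is the `(sequence_length, hidden_dim)` NumPy array produced at the end of the snippet above.

```python
import numpy as np

# Minimal sketch (not from the README): collapse the per-residue matrix
# into one fixed-size vector by averaging over the residue dimension.
# Assumes `embeddings` is the (seq_len, hidden_dim) NumPy array from above.
sequence_embedding = embeddings.mean(axis=0)

print("Sequence-level embedding shape:", sequence_embedding.shape)  # (hidden_dim,)
```

Mean pooling is only one choice; max pooling or using a single special-token embedding are alternatives, and which works best depends on the downstream task.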
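The snippet also embeds one sequence at a time. For a large set of fusion sequences, batching is usually faster. The following is a hedged sketch, not the repository's documented API: it assumes the `tokenizer`, `model`, and `device` objects from the snippet above, an ESM-style tokenizer that adds BOS and EOS tokens (which the README trims with `[1:-1]`), right-padding, and an `attention_mask` in the tokenizer output; the short second sequence is a made-up toy peptide for illustration only.

```python
# Sketch only: batched embedding extraction, reusing `tokenizer`, `model`,
# and `device` from the snippet above.
sequences = [
    "MVSSDRPVSLEDEVSHSMKEMIGGCCVCSDERGWAENPLVYCDGHGCSVAVHQACYGIVQVPTGPWFCRKCESQERAARVPPQMGSVPVMTQPTLIYSQPVMRPPNPFGPVSGAQIQFM",
    "MKTAYIAKQRQISFVKSHFSRQLEERLGLIEVQ",  # toy peptide, not a real fusion oncoprotein
]

batch = tokenizer(sequences, return_tensors="pt", padding=True, truncation=True, max_length=2000)
batch = {k: v.to(device) for k, v in batch.items()}

with torch.no_grad():
    out = model(**batch)

per_residue = []
for i in range(len(sequences)):
    # Number of non-padding tokens for sequence i, including BOS and EOS
    n_tokens = batch["attention_mask"][i].sum().item()
    # Drop BOS (index 0) and EOS (index n_tokens - 1), keeping only real residues
    per_residue.append(out.last_hidden_state[i, 1 : n_tokens - 1, :].cpu().numpy())

for seq, emb in zip(sequences, per_residue):
    print(len(seq), emb.shape)  # emb.shape should be (len(seq), hidden_dim)
```

Trimming with the attention mask rather than a fixed `[1:-1]` slice matters in the batched case, because shorter sequences are padded and a fixed slice would keep padding positions.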