Update README.md
README.md
@@ -29,7 +29,7 @@ MutBERT is a transformer-based genome foundation model trained only on Human gen
 ```python
 from transformers import AutoTokenizer, AutoModel
 
-model_name = "JadenLong/MutBERT"
+model_name = "JadenLong/MutBERT-Multi"
 tokenizer = AutoTokenizer.from_pretrained(model_name)
 model = AutoModel.from_pretrained(model_name, trust_remote_code=True)
 ```
@@ -52,7 +52,7 @@ dna = "ATCGGGGCCCATTA"
 inputs = tokenizer(dna, return_tensors='pt')["input_ids"]
 
 mut_inputs = F.one_hot(inputs, num_classes=len(tokenizer)).float().to("cpu") # len(tokenizer) is vocab size
-last_hidden_state = model(
+last_hidden_state = model(mut_inputs).last_hidden_state # [1, sequence_length, 768]
 # or: last_hidden_state = model(mut_inputs)[0] # [1, sequence_length, 768]
 
 # embedding with mean pooling
@@ -60,8 +60,9 @@ embedding_mean = torch.mean(last_hidden_state[0], dim=0)
 print(embedding_mean.shape) # expect to be 768
 
 # embedding with max pooling
-embedding_max = torch.max(
+embedding_max = torch.max(last_hidden_state[0], dim=0)[0]
 print(embedding_max.shape) # expect to be 768
+
 ```
 
 ### Using as a Classifier
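Pieced together, the two hunks above give the updated embedding example. The following is a minimal runnable sketch, assuming the `JadenLong/MutBERT-Multi` checkpoint from this diff and that the model's custom code accepts one-hot inputs, as the README snippet implies:

```python
import torch
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoModel

model_name = "JadenLong/MutBERT-Multi"  # checkpoint name taken from the diff
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True)

dna = "ATCGGGGCCCATTA"
inputs = tokenizer(dna, return_tensors="pt")["input_ids"]

# One-hot encode the token IDs; len(tokenizer) is the vocabulary size.
mut_inputs = F.one_hot(inputs, num_classes=len(tokenizer)).float().to("cpu")
last_hidden_state = model(mut_inputs).last_hidden_state  # [1, sequence_length, 768]

# Mean pooling over the sequence dimension.
embedding_mean = torch.mean(last_hidden_state[0], dim=0)
print(embedding_mean.shape)  # torch.Size([768])

# Max pooling: torch.max returns (values, indices), so keep [0],
# which is exactly the fix the hunk above introduces.
embedding_max = torch.max(last_hidden_state[0], dim=0)[0]
print(embedding_max.shape)  # torch.Size([768])
```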
@@ -69,7 +70,7 @@ print(embedding_max.shape) # expect to be 768
 ```python
 from transformers import AutoModelForSequenceClassification
 
-model_name = "JadenLong/MutBERT"
+model_name = "JadenLong/MutBERT-Multi"
 model = AutoModelForSequenceClassification.from_pretrained(model_name, trust_remote_code=True, num_labels=2)
 ```
 
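For completeness, a sketch of invoking the classifier after this change. Whether the classification head takes plain input IDs or the one-hot inputs used above is an assumption here, not something the diff states:

```python
import torch
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoModelForSequenceClassification

model_name = "JadenLong/MutBERT-Multi"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(
    model_name, trust_remote_code=True, num_labels=2
)

# Assumption: the head accepts the same one-hot inputs as the base model;
# swap in plain input_ids if MutBERT's custom code expects those instead.
ids = tokenizer("ATCGGGGCCCATTA", return_tensors="pt")["input_ids"]
one_hot = F.one_hot(ids, num_classes=len(tokenizer)).float()
with torch.no_grad():
    logits = model(one_hot).logits  # [1, 2]; the head is untrained until fine-tuned
print(logits.argmax(dim=-1))
```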
@@ -80,6 +81,7 @@ Allowed types for RoPE scaling are: `linear` and `dynamic`. To extend the model'
 If you want to scale your model context by 2x:
 
 ```python
+model_name = "JadenLong/MutBERT-Multi"
 model = AutoModel.from_pretrained(model_name,
                                   trust_remote_code=True,
                                   rope_scaling={'type': 'dynamic','factor': 2.0}
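The final hunk is cut off mid-call, so for reference here is the completed call as a sketch; the `rope_scaling` kwargs and `model_name` come straight from the diff, while the closing parenthesis is filled in so the snippet runs:

```python
from transformers import AutoModel

model_name = "JadenLong/MutBERT-Multi"
model = AutoModel.from_pretrained(
    model_name,
    trust_remote_code=True,
    rope_scaling={'type': 'dynamic', 'factor': 2.0},  # dynamic RoPE scaling, 2x context
)
```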