---
license: mit
tags:
- LucaOne
- Biological Foundation Model
- Unified Nucleic Acid and Protein Language Model
- Biology
- AI4Science
- AI4Biology
- Bio
- 1.1.1
language:
- en
---
# LucaOne/LucaGPLM
LucaOne/LucaGPLM - The LUCA Gene-Protein language model.
## Installation
You can install the package from source using pip:
```bash
pip install lucaone==1.1.1
pip install tokenizers==0.19.1
pip install transformers==4.41.2
```
## Usage
Please refer to the `huggingface` branch of LucaOne: https://github.com/LucaOne/LucaOne.
### 1. Feature Extraction/Embedding
Extract high-dimensional embeddings for downstream analysis or training downstream tasks using LucaOne-Embedding.
```python
import torch
import lucaone
from transformers import AutoTokenizer, AutoModel, TrainingArguments, Trainer

# model_id: checkpoint on the Hugging Face Hub
model_id = "LucaGroup/LucaOne-default-step36M"
tokenizer = AutoTokenizer.from_pretrained(
    model_id,
    trust_remote_code=True,
    force_download=True
)
model = AutoModel.from_pretrained(
    model_id,
    task_level="token_level",
    task_type="embedding",
    trust_remote_code=True,
    force_download=True
)
print(model)
print("*" * 50)

# device: prefer GPU when available; inference only, so switch to eval mode
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()

# nucleotide sequence
nucleotide_sequence = "ATGCGTACGTTAGC"
print("Nucleotide sequence len: %d" % len(nucleotide_sequence))

# nucleotide sequence embedding
print("Processing Nucleotide Sequence...")
nucleotide_inputs = tokenizer(
    nucleotide_sequence,
    # note: gene sequence(for DNA or RNA)
    seq_type="gene",
    return_tensors="pt",
    add_special_tokens=True
)
# move every input tensor to the target device
nucleotide_inputs = {k: v.to(device) for k, v in nucleotide_inputs.items()}
print("Nucleotide inputs:")
print(nucleotide_inputs)
with torch.no_grad():
    nucleotide_outputs = model(**nucleotide_inputs)

# last hidden matrix as embedding matrix: [batch_size, seq_len + 2, hidden_size]
nucleotide_last_hidden = nucleotide_outputs.last_hidden_state
# mean pooling over sequence positions, excluding [CLS]/[SEP].
# BUG FIX: after `[0, 1:-1, :]` the tensor is [seq_len, hidden_size], so the
# sequence axis is dim=0 — the original `dim=1` averaged over hidden features.
mean_nucleotide_embedding = nucleotide_last_hidden[0, 1:-1, :].mean(dim=0)
# cls pooling: the [CLS] token vector of the first (only) batch item
cls_nucleotide_embedding = nucleotide_last_hidden[0, 0, :]
print(f"Nucleotide Embedding Shape: {nucleotide_last_hidden.shape}")
print("Nucleotide Embedding(Matrix, Include [CLS] and [SEP]):")
print(nucleotide_last_hidden)
print("Nucleotide Embedding(Mean Pooling Vector):")
print(mean_nucleotide_embedding)
print("Nucleotide Embedding(CLS Pooling Vector):")
print(cls_nucleotide_embedding)
print("*" * 50)

# Protein Sequence
protein_sequence = "MKTLLILTAVVLL"
# BUG FIX: report the protein length (the original measured nucleotide_sequence)
print("Protein sequence len: %d" % len(protein_sequence))
print("Processing Protein Sequence...")
prot_inputs = tokenizer(
    protein_sequence,
    # note: protein sequence
    seq_type="prot",
    return_tensors="pt",
    add_special_tokens=True
)
# move every input tensor to the target device
prot_inputs = {k: v.to(device) for k, v in prot_inputs.items()}
print("Protein inputs:")
print(prot_inputs)
with torch.no_grad():
    prot_outputs = model(**prot_inputs)

# last hidden matrix as embedding matrix: [batch_size, seq_len + 2, hidden_size]
prot_last_hidden = prot_outputs.last_hidden_state
# mean pooling over sequence positions (keeps batch dim: [batch_size, hidden_size])
mean_prot_embedding = prot_last_hidden[:, 1:-1, :].mean(dim=1)
# cls pooling: [batch_size, hidden_size]
cls_prot_embedding = prot_last_hidden[:, 0, :]
print(f"Protein Embedding Shape: {prot_last_hidden.shape}")
print("Protein Embedding(Matrix, Include [CLS] and [SEP]):")
print(prot_last_hidden)
print("Protein Embedding(Mean Pooling Vector):")
print(mean_prot_embedding)
print("Protein Embedding(CLS Pooling Vector):")
print(cls_prot_embedding)
print("*" * 50)
```
### 2. MLM Pre-training and Sequence Recovery
Continue to perform MLM pre-training or sequence recovery.
```python
import torch
import lucaone
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForMaskedLM, TrainingArguments, Trainer

# Checkpoint id on the Hugging Face Hub.
model_id = "LucaGroup/LucaOne-default-step36M"

# Load the tokenizer and the masked-LM model from the Hub.
tokenizer = AutoTokenizer.from_pretrained(
    model_id,
    trust_remote_code=True,
    force_download=True
)
model = AutoModelForMaskedLM.from_pretrained(
    model_id,
    trust_remote_code=True,
    force_download=True
)
print(model)
print("*" * 50)

# Unfreeze every weight so the whole model is fine-tuned.
model.requires_grad_(True)

# create dataset and trainer for training...
```
### 3. Sequence Classification
Predict properties for the entire sequence (e.g., Enzyme vs. Non-Enzyme).
Supports `multi-class classification`, `binary classification`, `multi-label classification`, and `regression` tasks.
```python
import torch
import lucaone
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer

# Checkpoint id on the Hugging Face Hub.
model_id = "LucaGroup/LucaOne-default-step36M"

# Load the tokenizer and a 4-way sequence-level classifier head.
tokenizer = AutoTokenizer.from_pretrained(
    model_id,
    trust_remote_code=True,
    force_download=True
)
model = AutoModelForSequenceClassification.from_pretrained(
    model_id,
    task_level="seq_level",
    task_type="multi_class",
    classifier_num_labels=4,
    trust_remote_code=True,
    force_download=True
)
print(model)
print("*" * 50)

# Unfreeze every weight so the whole model is fine-tuned.
model.requires_grad_(True)

# create dataset and trainer for training...
```
### 4. Token Classification
Predict properties for each residue/nucleotide (e.g., Secondary Structure, Binding Sites, and Post-Translational Modifications).
Supports `multi-class classification`, `binary classification`, `multi-label classification`, and `regression` tasks.
```python
import torch
import lucaone
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForTokenClassification, TrainingArguments, Trainer

# Checkpoint id on the Hugging Face Hub.
model_id = "LucaGroup/LucaOne-default-step36M"

# Load the tokenizer and a binary per-token classifier head.
tokenizer = AutoTokenizer.from_pretrained(
    model_id,
    trust_remote_code=True,
    force_download=True
)
model = AutoModelForTokenClassification.from_pretrained(
    model_id,
    task_level="token_level",
    task_type="binary_class",
    classifier_num_labels=2,
    trust_remote_code=True,
    force_download=True
)
print(model)
print("*" * 50)

# Unfreeze every weight so the whole model is fine-tuned.
model.requires_grad_(True)

# create dataset and trainer for training...
```
## Github
For long sequence embedding or using LucaOne for downstream tasks, please refer to the git repository:
https://github.com/LucaOne/LucaOne,
https://github.com/LucaOne/LucaOneTasks