LucaGroup committed on
Commit
0243cb0
·
verified ·
1 Parent(s): a41afc8

Update README.md

Browse files
Files changed (1) hide show
  1. README.md +220 -27
README.md CHANGED
@@ -1,15 +1,16 @@
1
  ---
2
- license: apache-2.0
3
  tags:
4
- - LucaOne
5
- - Biological Foundation Model
6
- - Unified Nucleic Acid and Protein Language Model
7
- - Biology
8
- - AI4Science
9
- - AI4Biology
10
- - Bio
 
11
  language:
12
- - en
13
  ---
14
 
15
  # LucaOne/LucaGPLM
@@ -21,37 +22,229 @@ LucaOne/LucaGPLM - The LUCA Gene-Protein language model.
21
  You can install the package from source using pip:
22
 
23
  ```bash
24
- pip install lucagplm
25
  pip install tokenizers==0.19.1
26
  pip install transformers==4.41.2
27
  ```
28
 
29
  ## Usage
30
 
 
 
 
 
 
31
  ```python
32
- from lucagplm import LucaGPLMModel, LucaGPLMTokenizer
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
33
 
34
- # Load model
35
- model = LucaGPLMModel.from_pretrained("LucaGroup/LucaOne-default-step36M")
36
- tokenizer = LucaGPLMTokenizer.from_pretrained("LucaGroup/LucaOne-default-step36M")
 
37
 
38
- # Example usage
39
- seq = "ATCG"
40
- # seq_type="gene", which includes DNA or RNA(Nucleotide Sequences)
41
- inputs = tokenizer(seq, seq_type="gene",return_tensors="pt")
42
- outputs = model(**inputs)
43
 
44
- print(outputs.last_hidden_state.shape)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
45
 
46
- seq = "NSQTA"
47
- inputs = tokenizer(seq, seq_type="prot",return_tensors="pt")
48
- outputs = model(**inputs)
 
 
 
 
 
 
 
 
 
 
 
 
 
49
 
50
- print(outputs.last_hidden_state.shape)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
51
  ```
52
 
53
- ## Github
54
- For long sequence embedding, please refer to the git repository:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
55
 
56
- https://github.com/LucaOne/LucaOne
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
57
 
 
 
 
1
  ---
2
+ license: mit
3
  tags:
4
+ - LucaOne
5
+ - Biological Foundation Model
6
+ - Unified Nucleic Acid and Protein Language Model
7
+ - Biology
8
+ - AI4Science
9
+ - AI4Biology
10
+ - Bio
11
+ - 1.1.0
12
  language:
13
+ - en
14
  ---
15
 
16
  # LucaOne/LucaGPLM
 
22
  You can install the package from source using pip:
23
 
24
  ```bash
25
+ pip install lucaone==1.1.0
26
  pip install tokenizers==0.19.1
27
  pip install transformers==4.41.2
28
  ```
29
 
30
  ## Usage
31
 
32
+ Please refer to the `huggingface` branch of LucaOne: https://github.com/LucaOne/LucaOne.
33
+
34
+ ### 1. Feature Extraction/Embedding
35
+ Extract high-dimensional embeddings for downstream analysis or training downstream tasks using LucaOne-Embedding.
36
+
37
  ```python
38
+ import torch
39
+ import lucaone
40
+ from transformers import AutoTokenizer, AutoModel, TrainingArguments, Trainer
41
+
42
+ # model_id
43
+ model_id = "LucaGroup/LucaOne-default-step36M"
44
+
45
+ tokenizer = AutoTokenizer.from_pretrained(
46
+ model_id,
47
+ trust_remote_code=True,
48
+ force_download=True
49
+ )
50
+
51
+ model = AutoModel.from_pretrained(
52
+ model_id,
53
+ task_level="token_level",
54
+ task_type="embedding",
55
+ trust_remote_code=True,
56
+ force_download=True
57
+ )
58
+ print(model)
59
+ print("*" * 50)
60
 
61
+ # device
62
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
63
+ model.to(device)
64
+ model.eval()
65
 
66
+ # nucleotide sequence
67
+ nucleotide_sequence = "ATGCGTACGTTAGC"
68
+ print("Nucleotide sequence len: %d" % len(nucleotide_sequence))
 
 
69
 
70
+ # nucleotide sequence embedding
71
+ print("Processing Nucleotide Sequence...")
72
+ nucleotide_inputs = tokenizer(
73
+ nucleotide_sequence,
74
+ # note: gene sequence(for DNA or RNA)
75
+ seq_type="gene",
76
+ return_tensors="pt",
77
+ add_special_tokens=True
78
+ )
79
+ new_nucleotide_inputs = {}
80
+ for item in nucleotide_inputs.items():
81
+ new_nucleotide_inputs[item[0]] = item[1].to(device)
82
+ nucleotide_inputs = new_nucleotide_inputs
83
+ print("Nucleotide inputs:")
84
+ print(nucleotide_inputs)
85
 
86
+ with torch.no_grad():
87
+ nucleotide_outputs = model(**nucleotide_inputs)
88
+ # last hidden matrix as embedding matrix: [batch_size, seq_len + 2, hidden_size]
89
+ nucleotide_last_hidden = nucleotide_outputs.last_hidden_state
90
+ # mean pooling
91
+ mean_nucleotide_embedding = nucleotide_last_hidden[0, 1:-1, :].mean(dim=0)
92
+ # cls pooling
93
+ cls_nucleotide_embedding = nucleotide_last_hidden[0, 0, :]
94
+ print(f"Nucleotide Embedding Shape: {nucleotide_last_hidden.shape}")
95
+ print("Nucleotide Embedding(Matrix, Include [CLS] and [SEP]):")
96
+ print(nucleotide_last_hidden)
97
+ print("Nucleotide Embedding(Mean Pooling Vector):")
98
+ print(mean_nucleotide_embedding)
99
+ print("Nucleotide Embedding(CLS Pooling Vector):")
100
+ print(cls_nucleotide_embedding)
101
+ print("*" * 50)
102
 
103
+ # Protein Sequence
104
+ protein_sequence = "MKTLLILTAVVLL"
105
+ print("Protein sequence len: %d" % len(protein_sequence))
106
+
107
+ print("Processing Protein Sequence...")
108
+ prot_inputs = tokenizer(
109
+ protein_sequence,
110
+ # note: protein sequence
111
+ seq_type="prot",
112
+ return_tensors="pt",
113
+ add_special_tokens=True
114
+ )
115
+ new_prot_inputs = {}
116
+ for item in prot_inputs.items():
117
+ new_prot_inputs[item[0]] = item[1].to(device)
118
+ prot_inputs = new_prot_inputs
119
+ print("Protein inputs:")
120
+ print(prot_inputs)
121
+
122
+ with torch.no_grad():
123
+ prot_outputs = model(**prot_inputs)
124
+ # last hidden matrix as embedding matrix: [batch_size, seq_len + 2, hidden_size]
125
+ prot_last_hidden = prot_outputs.last_hidden_state
126
+ # mean pooling
127
+ mean_prot_embedding = prot_last_hidden[:, 1:-1, :].mean(dim=1)
128
+ # cls pooling
129
+ cls_prot_embedding = prot_last_hidden[:, 0, :]
130
+ print(f"Protein Embedding Shape: {prot_last_hidden.shape}")
131
+ print("Protein Embedding(Matrix, Include [CLS] and [SEP]):")
132
+ print(prot_last_hidden)
133
+ print("Protein Embedding(Mean Pooling Vector):")
134
+ print(mean_prot_embedding)
135
+ print("Protein Embedding(CLS Pooling Vector):")
136
+ print(cls_prot_embedding)
137
+ print("*" * 50)
138
  ```
139
 
140
+ ### 2. MLM Pre-training and Sequence Recovery
141
+ Continue to perform MLM pre-training or sequence recovery.
142
+
143
+ ```python
144
+ import torch
145
+ import lucaone
146
+ from datasets import Dataset
147
+ from transformers import AutoTokenizer, AutoModelForMaskedLM, TrainingArguments, Trainer
148
+
149
+ # model_id
150
+ model_id = "LucaGroup/LucaOne-default-step36M"
151
+
152
+ model = AutoModelForMaskedLM.from_pretrained(
153
+ model_id,
154
+ trust_remote_code=True,
155
+ force_download=True
156
+ )
157
+
158
+ tokenizer = AutoTokenizer.from_pretrained(
159
+ model_id,
160
+ trust_remote_code=True,
161
+ force_download=True
162
+ )
163
+ print(model)
164
+ print("*" * 50)
165
+
166
+ # finetune all parameters
167
+ for param in model.parameters():
168
+ param.requires_grad = True
169
+
170
+ # create dataset and trainer for training...
171
+ ```
172
+ ### 3. Sequence Classification
173
+ Predict properties for the entire sequence (e.g., Enzyme vs. Non-Enzyme).
174
+
175
+ Supports `multi-class classification`, `binary classification`, `multi-label classification`, and `regression` tasks.
176
 
177
+ ```python
178
+ import torch
179
+ import lucaone
180
+ from datasets import Dataset
181
+ from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
182
+
183
+ # model_id
184
+ model_id = "LucaGroup/LucaOne-default-step36M"
185
+
186
+ model = AutoModelForSequenceClassification.from_pretrained(
187
+ model_id,
188
+ task_level="seq_level",
189
+ task_type="multi_class",
190
+ classifier_num_labels=4,
191
+ trust_remote_code=True,
192
+ force_download=True
193
+ )
194
+
195
+ tokenizer = AutoTokenizer.from_pretrained(
196
+ model_id,
197
+ trust_remote_code=True,
198
+ force_download=True
199
+ )
200
+ print(model)
201
+ print("*" * 50)
202
+
203
+ # finetune all parameters
204
+ for param in model.parameters():
205
+ param.requires_grad = True
206
+
207
+ # create dataset and trainer for training...
208
+ ```
209
+ ### 4. Token Classification
210
+ Predict properties for each residue/nucleotide (e.g., Secondary Structure, Binding Sites, and Post-Translational Modifications).
211
+
212
+ Supports `multi-class classification`, `binary classification`, `multi-label classification`, and `regression` tasks.
213
+
214
+ ```python
215
+ import torch
216
+ import lucaone
217
+ from datasets import Dataset
218
+ from transformers import AutoTokenizer, AutoModelForTokenClassification, TrainingArguments, Trainer
219
+
220
+ # model_id
221
+ model_id = "LucaGroup/LucaOne-default-step36M"
222
+
223
+ model = AutoModelForTokenClassification.from_pretrained(
224
+ model_id,
225
+ task_level="token_level",
226
+ task_type="binary_class",
227
+ classifier_num_labels=2,
228
+ trust_remote_code=True,
229
+ force_download=True
230
+ )
231
+
232
+ tokenizer = AutoTokenizer.from_pretrained(
233
+ model_id,
234
+ trust_remote_code=True,
235
+ force_download=True
236
+ )
237
+ print(model)
238
+ print("*" * 50)
239
+
240
+ # finetune all parameters
241
+ for param in model.parameters():
242
+ param.requires_grad = True
243
+
244
+ # create dataset and trainer for training...
245
+ ```
246
+ ## Github
247
+ For long sequence embedding or using LucaOne for downstream tasks, please refer to the git repository:
248
 
249
+ https://github.com/LucaOne/LucaOne,
250
+ https://github.com/LucaOne/LucaOneTasks