Add files using upload-large-folder tool
Browse files
This view is limited to 50 files because it contains too many changes.
See raw diff
- .gitattributes +1 -0
- data/protein-text/eval_assist.zipltqrmb6_.tmp +3 -0
- llm_model/model-00003-of-00004.safetensors +3 -0
- model/__pycache__/blip2.cpython-310.pyc +0 -0
- model/__pycache__/blip2.cpython-311.pyc +0 -0
- model/__pycache__/blip2.cpython-38.pyc +0 -0
- model/__pycache__/blip2_opt.cpython-310.pyc +0 -0
- model/__pycache__/blip2_opt.cpython-311.pyc +0 -0
- model/__pycache__/blip2_stage1.cpython-310.pyc +0 -0
- model/__pycache__/blip2_stage1.cpython-311.pyc +0 -0
- model/__pycache__/blip2_stage1.cpython-38.pyc +0 -0
- model/__pycache__/blip2_stage2.cpython-310.pyc +0 -0
- model/__pycache__/blip2_stage3.cpython-310.pyc +0 -0
- model/__pycache__/blip2_stage3.cpython-311.pyc +0 -0
- model/__pycache__/blip2qformer.cpython-310.pyc +0 -0
- model/__pycache__/blip2qformer.cpython-311.pyc +0 -0
- model/__pycache__/blip2qformer.cpython-38.pyc +0 -0
- model/__pycache__/dist_funs.cpython-310.pyc +0 -0
- model/__pycache__/help_funcs.cpython-310.pyc +0 -0
- model/__pycache__/opt_flash_attention.cpython-310.pyc +0 -0
- model/__pycache__/opt_flash_attention.cpython-311.pyc +0 -0
- model/__pycache__/opt_flash_attention.cpython-313.pyc +0 -0
- model/test.py +34 -0
- plm_model/esm2-150m/.gitattributes +33 -0
- plm_model/esm2-150m/README.md +20 -0
- plm_model/esm2-150m/config.json +30 -0
- plm_model/esm2-150m/model.safetensors +3 -0
- plm_model/esm2-150m/pytorch_model.bin +3 -0
- plm_model/esm2-150m/special_tokens_map.json +7 -0
- plm_model/esm2-150m/tf_model.h5 +3 -0
- plm_model/esm2-150m/tokenizer_config.json +4 -0
- plm_model/esm2-150m/vocab.txt +33 -0
- plm_model/microsoft/.gitattributes +9 -0
- plm_model/microsoft/LICENSE.md +21 -0
- plm_model/microsoft/README.md +38 -0
- plm_model/microsoft/config.json +17 -0
- plm_model/microsoft/flax_model.msgpack +3 -0
- plm_model/microsoft/pytorch_model.bin +3 -0
- plm_model/microsoft/tokenizer_config.json +3 -0
- plm_model/microsoft/vocab.txt +0 -0
- results/2datasets_construct_predictions1.txt +0 -0
- results/2datasets_qweninstruct_predictions.txt +0 -0
- results/aav_07252307_predictions.txt +0 -0
- results/ablation_deeplocbinary_predictions.txt +0 -0
- results/ablation_fluorescence_predictions.txt +0 -0
- results/ablation_gb1.0_predictions.txt +0 -0
- results/ablation_gb1_predictions.txt +0 -0
- results/ablation_material_predictions.txt +0 -0
- results/ablation_metallonbinding_predictions.txt +0 -0
- results/antibiotic_07262045_predictions.txt +0 -0
.gitattributes
CHANGED
|
@@ -54,3 +54,4 @@ data_small/OntoProteinDatasetV2/train.txt filter=lfs diff=lfs merge=lfs -text
|
|
| 54 |
data_small/PDBDataset/abstract.json filter=lfs diff=lfs merge=lfs -text
|
| 55 |
data_small/PDBDataset/qa_all.json filter=lfs diff=lfs merge=lfs -text
|
| 56 |
data_small/SwissProtV3/train_set_.jsonl filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
| 54 |
data_small/PDBDataset/abstract.json filter=lfs diff=lfs merge=lfs -text
|
| 55 |
data_small/PDBDataset/qa_all.json filter=lfs diff=lfs merge=lfs -text
|
| 56 |
data_small/SwissProtV3/train_set_.jsonl filter=lfs diff=lfs merge=lfs -text
|
| 57 |
+
data/protein-text/eval_assist.zipltqrmb6_.tmp filter=lfs diff=lfs merge=lfs -text
|
data/protein-text/eval_assist.zipltqrmb6_.tmp
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:6092fb4729236af66e921aff8d7330012bc0eb1428240856251cd14ed23b45e9
|
| 3 |
+
size 8291512692
|
llm_model/model-00003-of-00004.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:c5e22df05cdc66407dd05e130ce0e47ddf56e2d3ee6ef6eef9249b07ecb4e9d7
|
| 3 |
+
size 4333083488
|
model/__pycache__/blip2.cpython-310.pyc
ADDED
|
Binary file (3.17 kB). View file
|
|
|
model/__pycache__/blip2.cpython-311.pyc
ADDED
|
Binary file (5.11 kB). View file
|
|
|
model/__pycache__/blip2.cpython-38.pyc
ADDED
|
Binary file (3.11 kB). View file
|
|
|
model/__pycache__/blip2_opt.cpython-310.pyc
ADDED
|
Binary file (9.74 kB). View file
|
|
|
model/__pycache__/blip2_opt.cpython-311.pyc
ADDED
|
Binary file (18.8 kB). View file
|
|
|
model/__pycache__/blip2_stage1.cpython-310.pyc
ADDED
|
Binary file (11.9 kB). View file
|
|
|
model/__pycache__/blip2_stage1.cpython-311.pyc
ADDED
|
Binary file (33.7 kB). View file
|
|
|
model/__pycache__/blip2_stage1.cpython-38.pyc
ADDED
|
Binary file (13.7 kB). View file
|
|
|
model/__pycache__/blip2_stage2.cpython-310.pyc
ADDED
|
Binary file (14.1 kB). View file
|
|
|
model/__pycache__/blip2_stage3.cpython-310.pyc
ADDED
|
Binary file (10.9 kB). View file
|
|
|
model/__pycache__/blip2_stage3.cpython-311.pyc
ADDED
|
Binary file (21.1 kB). View file
|
|
|
model/__pycache__/blip2qformer.cpython-310.pyc
ADDED
|
Binary file (7.54 kB). View file
|
|
|
model/__pycache__/blip2qformer.cpython-311.pyc
ADDED
|
Binary file (15.8 kB). View file
|
|
|
model/__pycache__/blip2qformer.cpython-38.pyc
ADDED
|
Binary file (7.46 kB). View file
|
|
|
model/__pycache__/dist_funs.cpython-310.pyc
ADDED
|
Binary file (6.71 kB). View file
|
|
|
model/__pycache__/help_funcs.cpython-310.pyc
ADDED
|
Binary file (3.96 kB). View file
|
|
|
model/__pycache__/opt_flash_attention.cpython-310.pyc
ADDED
|
Binary file (7.2 kB). View file
|
|
|
model/__pycache__/opt_flash_attention.cpython-311.pyc
ADDED
|
Binary file (15.8 kB). View file
|
|
|
model/__pycache__/opt_flash_attention.cpython-313.pyc
ADDED
|
Binary file (15.2 kB). View file
|
|
|
model/test.py
ADDED
|
@@ -0,0 +1,34 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

# Smoke test: load a fine-tuned Qwen2.5-7B-Instruct checkpoint and run
# generation from `inputs_embeds` (mirrors how the BLIP-2-style pipeline
# injects soft prompts instead of token ids).

# Hoisted so tokenizer and model are guaranteed to load the same checkpoint.
CHECKPOINT = "/oss/wangyujia/BIO/pretrain_output/qwen2.5-7b-instruct-bio/bio_all/save1epoch/checkpoint-1300"

llm_tokenizer = AutoTokenizer.from_pretrained(CHECKPOINT, use_fast=False, padding_side='right')
llm_tokenizer.add_special_tokens({'pad_token': '<pad>'})

llm_model = AutoModelForCausalLM.from_pretrained(CHECKPOINT, torch_dtype=torch.bfloat16)
# Embedding table must grow to cover the newly added <pad> token.
llm_model.resize_token_embeddings(len(llm_tokenizer))

text = "You need to answer the following question directly, which means you can only give the number of the option in the answer. For example: <ANSWER>option number</ANSWER> Based on the following protein\'s amino acid sequence, is the protein located on the membrane? Swiss-Prot description for P86987,Options:\n0.Yes\n1.No"

# Step 1: encode into input_ids and attention_mask (tensors required).
inputs = llm_tokenizer(text, return_tensors="pt")

input_ids = inputs["input_ids"]  # shape: [1, L]
attention_mask = inputs["attention_mask"]  # shape: [1, L]

# Step 2: obtain inputs_embeds from the model's embedding layer.
with torch.no_grad():
    inputs_embeds = llm_model.get_input_embeddings()(input_ids)


outputs = llm_model.generate(
    inputs_embeds=inputs_embeds,
    attention_mask=attention_mask,

    # NOTE(review): with inputs_embeds (no input_ids), max_length counts only
    # newly generated tokens in recent transformers versions — confirm intent.
    max_length=128,
    min_length=1,

    use_cache=True,
    # NOTE(review): "hybrid" cache_implementation is specific to certain model
    # families (e.g. Gemma-2); verify Qwen2.5 supports it or drop the argument.
    cache_implementation="hybrid"
)
# BUG FIX: original read `self.llm_tokenizer` at module level, which raises
# NameError (there is no `self` in a flat script).
output_text = llm_tokenizer.batch_decode(outputs, skip_special_tokens=True)
# Renamed loop variable to avoid shadowing the prompt string `text`.
output_text = [t.strip() for t in output_text]
print(output_text)
|
plm_model/esm2-150m/.gitattributes
ADDED
|
@@ -0,0 +1,33 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
*.7z filter=lfs diff=lfs merge=lfs -text
|
| 2 |
+
*.arrow filter=lfs diff=lfs merge=lfs -text
|
| 3 |
+
*.bin filter=lfs diff=lfs merge=lfs -text
|
| 4 |
+
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
| 5 |
+
*.ftz filter=lfs diff=lfs merge=lfs -text
|
| 6 |
+
*.gz filter=lfs diff=lfs merge=lfs -text
|
| 7 |
+
*.h5 filter=lfs diff=lfs merge=lfs -text
|
| 8 |
+
*.joblib filter=lfs diff=lfs merge=lfs -text
|
| 9 |
+
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
| 10 |
+
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
| 11 |
+
*.model filter=lfs diff=lfs merge=lfs -text
|
| 12 |
+
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
| 13 |
+
*.npy filter=lfs diff=lfs merge=lfs -text
|
| 14 |
+
*.npz filter=lfs diff=lfs merge=lfs -text
|
| 15 |
+
*.onnx filter=lfs diff=lfs merge=lfs -text
|
| 16 |
+
*.ot filter=lfs diff=lfs merge=lfs -text
|
| 17 |
+
*.parquet filter=lfs diff=lfs merge=lfs -text
|
| 18 |
+
*.pb filter=lfs diff=lfs merge=lfs -text
|
| 19 |
+
*.pickle filter=lfs diff=lfs merge=lfs -text
|
| 20 |
+
*.pkl filter=lfs diff=lfs merge=lfs -text
|
| 21 |
+
*.pt filter=lfs diff=lfs merge=lfs -text
|
| 22 |
+
*.pth filter=lfs diff=lfs merge=lfs -text
|
| 23 |
+
*.rar filter=lfs diff=lfs merge=lfs -text
|
| 24 |
+
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
| 25 |
+
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
| 26 |
+
*.tflite filter=lfs diff=lfs merge=lfs -text
|
| 27 |
+
*.tgz filter=lfs diff=lfs merge=lfs -text
|
| 28 |
+
*.wasm filter=lfs diff=lfs merge=lfs -text
|
| 29 |
+
*.xz filter=lfs diff=lfs merge=lfs -text
|
| 30 |
+
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 31 |
+
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 32 |
+
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 33 |
+
model.safetensors filter=lfs diff=lfs merge=lfs -text
|
plm_model/esm2-150m/README.md
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
license: mit
|
| 3 |
+
widget:
|
| 4 |
+
- text: "MQIFVKTLTGKTITLEVEPS<mask>TIENVKAKIQDKEGIPPDQQRLIFAGKQLEDGRTLSDYNIQKESTLHLVLRLRGG"
|
| 5 |
+
---
|
| 6 |
+
|
| 7 |
+
## ESM-2
|
| 8 |
+
|
| 9 |
+
ESM-2 is a state-of-the-art protein model trained on a masked language modelling objective. It is suitable for fine-tuning on a wide range of tasks that take protein sequences as input. For detailed information on the model architecture and training data, please refer to the [accompanying paper](https://www.biorxiv.org/content/10.1101/2022.07.20.500902v2). You may also be interested in some demo notebooks ([PyTorch](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/protein_language_modeling.ipynb), [TensorFlow](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/protein_language_modeling-tf.ipynb)) which demonstrate how to fine-tune ESM-2 models on your tasks of interest.
|
| 10 |
+
|
| 11 |
+
Several ESM-2 checkpoints are available in the Hub with varying sizes. Larger sizes generally have somewhat better accuracy, but require much more memory and time to train:
|
| 12 |
+
|
| 13 |
+
| Checkpoint name | Num layers | Num parameters |
|
| 14 |
+
|------------------------------|----|----------|
|
| 15 |
+
| [esm2_t48_15B_UR50D](https://huggingface.co/facebook/esm2_t48_15B_UR50D) | 48 | 15B |
|
| 16 |
+
| [esm2_t36_3B_UR50D](https://huggingface.co/facebook/esm2_t36_3B_UR50D) | 36 | 3B |
|
| 17 |
+
| [esm2_t33_650M_UR50D](https://huggingface.co/facebook/esm2_t33_650M_UR50D) | 33 | 650M |
|
| 18 |
+
| [esm2_t30_150M_UR50D](https://huggingface.co/facebook/esm2_t30_150M_UR50D) | 30 | 150M |
|
| 19 |
+
| [esm2_t12_35M_UR50D](https://huggingface.co/facebook/esm2_t12_35M_UR50D) | 12 | 35M |
|
| 20 |
+
| [esm2_t6_8M_UR50D](https://huggingface.co/facebook/esm2_t6_8M_UR50D) | 6 | 8M |
|
plm_model/esm2-150m/config.json
ADDED
|
@@ -0,0 +1,30 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"_name_or_path": "/tmp/facebook/esm2_t30_150M_UR50D",
|
| 3 |
+
"architectures": [
|
| 4 |
+
"EsmForMaskedLM"
|
| 5 |
+
],
|
| 6 |
+
"attention_probs_dropout_prob": 0.0,
|
| 7 |
+
"classifier_dropout": null,
|
| 8 |
+
"emb_layer_norm_before": false,
|
| 9 |
+
"esmfold_config": null,
|
| 10 |
+
"hidden_act": "gelu",
|
| 11 |
+
"hidden_dropout_prob": 0.0,
|
| 12 |
+
"hidden_size": 640,
|
| 13 |
+
"initializer_range": 0.02,
|
| 14 |
+
"intermediate_size": 2560,
|
| 15 |
+
"is_folding_model": false,
|
| 16 |
+
"layer_norm_eps": 1e-05,
|
| 17 |
+
"mask_token_id": 32,
|
| 18 |
+
"max_position_embeddings": 1026,
|
| 19 |
+
"model_type": "esm",
|
| 20 |
+
"num_attention_heads": 20,
|
| 21 |
+
"num_hidden_layers": 30,
|
| 22 |
+
"pad_token_id": 1,
|
| 23 |
+
"position_embedding_type": "rotary",
|
| 24 |
+
"token_dropout": true,
|
| 25 |
+
"torch_dtype": "float32",
|
| 26 |
+
"transformers_version": "4.25.0.dev0",
|
| 27 |
+
"use_cache": true,
|
| 28 |
+
"vocab_list": null,
|
| 29 |
+
"vocab_size": 33
|
| 30 |
+
}
|
plm_model/esm2-150m/model.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:c3f1da8aea53bddd32c246c86168c23b9fd72341fb9db9a94436f855f5053566
|
| 3 |
+
size 595257706
|
plm_model/esm2-150m/pytorch_model.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:a88feb574b9f4e31c45762961d1b2ddba95796db86fb480d207b4d15e6ec8aab
|
| 3 |
+
size 595364077
|
plm_model/esm2-150m/special_tokens_map.json
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"cls_token": "<cls>",
|
| 3 |
+
"eos_token": "<eos>",
|
| 4 |
+
"mask_token": "<mask>",
|
| 5 |
+
"pad_token": "<pad>",
|
| 6 |
+
"unk_token": "<unk>"
|
| 7 |
+
}
|
plm_model/esm2-150m/tf_model.h5
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:78b04a25f0ded4bd3a7665d4589863984117384b239cee6de97c46aefec4a1b5
|
| 3 |
+
size 593355136
|
plm_model/esm2-150m/tokenizer_config.json
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"model_max_length": 1000000000000000019884624838656,
|
| 3 |
+
"tokenizer_class": "EsmTokenizer"
|
| 4 |
+
}
|
plm_model/esm2-150m/vocab.txt
ADDED
|
@@ -0,0 +1,33 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
<cls>
|
| 2 |
+
<pad>
|
| 3 |
+
<eos>
|
| 4 |
+
<unk>
|
| 5 |
+
L
|
| 6 |
+
A
|
| 7 |
+
G
|
| 8 |
+
V
|
| 9 |
+
S
|
| 10 |
+
E
|
| 11 |
+
R
|
| 12 |
+
T
|
| 13 |
+
I
|
| 14 |
+
D
|
| 15 |
+
P
|
| 16 |
+
K
|
| 17 |
+
Q
|
| 18 |
+
N
|
| 19 |
+
F
|
| 20 |
+
Y
|
| 21 |
+
M
|
| 22 |
+
H
|
| 23 |
+
W
|
| 24 |
+
C
|
| 25 |
+
X
|
| 26 |
+
B
|
| 27 |
+
U
|
| 28 |
+
Z
|
| 29 |
+
O
|
| 30 |
+
.
|
| 31 |
+
-
|
| 32 |
+
<null_1>
|
| 33 |
+
<mask>
|
plm_model/microsoft/.gitattributes
ADDED
|
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
*.bin.* filter=lfs diff=lfs merge=lfs -text
|
| 2 |
+
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
| 3 |
+
*.bin filter=lfs diff=lfs merge=lfs -text
|
| 4 |
+
*.h5 filter=lfs diff=lfs merge=lfs -text
|
| 5 |
+
*.tflite filter=lfs diff=lfs merge=lfs -text
|
| 6 |
+
*.tar.gz filter=lfs diff=lfs merge=lfs -text
|
| 7 |
+
*.ot filter=lfs diff=lfs merge=lfs -text
|
| 8 |
+
*.onnx filter=lfs diff=lfs merge=lfs -text
|
| 9 |
+
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
plm_model/microsoft/LICENSE.md
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
MIT License
|
| 2 |
+
|
| 3 |
+
Copyright (c) Microsoft Corporation
|
| 4 |
+
|
| 5 |
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
| 6 |
+
of this software and associated documentation files (the "Software"), to deal
|
| 7 |
+
in the Software without restriction, including without limitation the rights
|
| 8 |
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
| 9 |
+
copies of the Software, and to permit persons to whom the Software is
|
| 10 |
+
furnished to do so, subject to the following conditions:
|
| 11 |
+
|
| 12 |
+
The above copyright notice and this permission notice shall be included in all
|
| 13 |
+
copies or substantial portions of the Software.
|
| 14 |
+
|
| 15 |
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
| 16 |
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
| 17 |
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
|
| 18 |
+
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
|
| 19 |
+
DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
|
| 20 |
+
OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE
|
| 21 |
+
OR OTHER DEALINGS IN THE SOFTWARE.
|
plm_model/microsoft/README.md
ADDED
|
@@ -0,0 +1,38 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
language: en
|
| 3 |
+
tags:
|
| 4 |
+
- exbert
|
| 5 |
+
license: mit
|
| 6 |
+
widget:
|
| 7 |
+
- text: "[MASK] is a tumor suppressor gene."
|
| 8 |
+
---
|
| 9 |
+
|
| 10 |
+
## MSR BiomedBERT (abstracts + full text)
|
| 11 |
+
|
| 12 |
+
<div style="border: 2px solid orange; border-radius:10px; padding:0px 10px; width: fit-content;">
|
| 13 |
+
|
| 14 |
+
* This model was previously named **"PubMedBERT (abstracts + full text)"**.
|
| 15 |
+
* You can either adopt the new model name "microsoft/BiomedNLP-BiomedBERT-base-uncased-abstract-fulltext" or update your `transformers` library to version 4.22+ if you need to refer to the old name.
|
| 16 |
+
|
| 17 |
+
</div>
|
| 18 |
+
|
| 19 |
+
Pretraining large neural language models, such as BERT, has led to impressive gains on many natural language processing (NLP) tasks. However, most pretraining efforts focus on general domain corpora, such as newswire and Web. A prevailing assumption is that even domain-specific pretraining can benefit by starting from general-domain language models. [Recent work](https://arxiv.org/abs/2007.15779) shows that for domains with abundant unlabeled text, such as biomedicine, pretraining language models from scratch results in substantial gains over continual pretraining of general-domain language models.
|
| 20 |
+
|
| 21 |
+
BiomedBERT is pretrained from scratch using _abstracts_ from [PubMed](https://pubmed.ncbi.nlm.nih.gov/) and _full-text_ articles from [PubMedCentral](https://www.ncbi.nlm.nih.gov/pmc/). This model achieves state-of-the-art performance on many biomedical NLP tasks, and currently holds the top score on the [Biomedical Language Understanding and Reasoning Benchmark](https://aka.ms/BLURB).
|
| 22 |
+
|
| 23 |
+
## Citation
|
| 24 |
+
|
| 25 |
+
If you find BiomedBERT useful in your research, please cite the following paper:
|
| 26 |
+
|
| 27 |
+
```latex
|
| 28 |
+
@misc{pubmedbert,
|
| 29 |
+
author = {Yu Gu and Robert Tinn and Hao Cheng and Michael Lucas and Naoto Usuyama and Xiaodong Liu and Tristan Naumann and Jianfeng Gao and Hoifung Poon},
|
| 30 |
+
title = {Domain-Specific Language Model Pretraining for Biomedical Natural Language Processing},
|
| 31 |
+
year = {2020},
|
| 32 |
+
eprint = {arXiv:2007.15779},
|
| 33 |
+
}
|
| 34 |
+
```
|
| 35 |
+
|
| 36 |
+
<a href="https://huggingface.co/exbert/?model=microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext&modelKind=bidirectional&sentence=Gefitinib%20is%20an%20EGFR%20tyrosine%20kinase%20inhibitor,%20which%20is%20often%20used%20for%20breast%20cancer%20and%20NSCLC%20treatment.&layer=3&heads=..0,1,2,3,4,5,6,7,8,9,10,11&threshold=0.7&tokenInd=17&tokenSide=right&maskInds=..&hideClsSep=true">
|
| 37 |
+
<img width="300px" src="https://cdn-media.huggingface.co/exbert/button.png">
|
| 38 |
+
</a>
|
plm_model/microsoft/config.json
ADDED
|
@@ -0,0 +1,17 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"architectures": [
|
| 3 |
+
"BertForMaskedLM"
|
| 4 |
+
],
|
| 5 |
+
"model_type": "bert",
|
| 6 |
+
"attention_probs_dropout_prob": 0.1,
|
| 7 |
+
"hidden_act": "gelu",
|
| 8 |
+
"hidden_dropout_prob": 0.1,
|
| 9 |
+
"hidden_size": 768,
|
| 10 |
+
"initializer_range": 0.02,
|
| 11 |
+
"intermediate_size": 3072,
|
| 12 |
+
"max_position_embeddings": 512,
|
| 13 |
+
"num_attention_heads": 12,
|
| 14 |
+
"num_hidden_layers": 12,
|
| 15 |
+
"type_vocab_size": 2,
|
| 16 |
+
"vocab_size": 30522
|
| 17 |
+
}
|
plm_model/microsoft/flax_model.msgpack
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:84761403b655e7d865093297cc57d574c5ec7ce705917f9d7011683c79f5fc41
|
| 3 |
+
size 437936109
|
plm_model/microsoft/pytorch_model.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:ad7bbb66376cfd6b2db3447192b034efe016337cbef135c35c411fd61b13c193
|
| 3 |
+
size 440474434
|
plm_model/microsoft/tokenizer_config.json
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"do_lower_case": true
|
| 3 |
+
}
|
plm_model/microsoft/vocab.txt
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
results/2datasets_construct_predictions1.txt
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
results/2datasets_qweninstruct_predictions.txt
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
results/aav_07252307_predictions.txt
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
results/ablation_deeplocbinary_predictions.txt
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
results/ablation_fluorescence_predictions.txt
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
results/ablation_gb1.0_predictions.txt
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
results/ablation_gb1_predictions.txt
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
results/ablation_material_predictions.txt
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
results/ablation_metallonbinding_predictions.txt
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
results/antibiotic_07262045_predictions.txt
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|