Add files using upload-large-folder tool
Browse files
This view is limited to 50 files because it contains too many changes.
See raw diff
- .gitattributes +1 -0
- data/protein-text/eval_assist.zipltqrmb6_.tmp +3 -0
- llm_model/model-00003-of-00004.safetensors +3 -0
- model/__pycache__/blip2.cpython-310.pyc +0 -0
- model/__pycache__/blip2.cpython-311.pyc +0 -0
- model/__pycache__/blip2.cpython-38.pyc +0 -0
- model/__pycache__/blip2_opt.cpython-310.pyc +0 -0
- model/__pycache__/blip2_opt.cpython-311.pyc +0 -0
- model/__pycache__/blip2_stage1.cpython-310.pyc +0 -0
- model/__pycache__/blip2_stage1.cpython-311.pyc +0 -0
- model/__pycache__/blip2_stage1.cpython-38.pyc +0 -0
- model/__pycache__/blip2_stage2.cpython-310.pyc +0 -0
- model/__pycache__/blip2_stage3.cpython-310.pyc +0 -0
- model/__pycache__/blip2_stage3.cpython-311.pyc +0 -0
- model/__pycache__/blip2qformer.cpython-310.pyc +0 -0
- model/__pycache__/blip2qformer.cpython-311.pyc +0 -0
- model/__pycache__/blip2qformer.cpython-38.pyc +0 -0
- model/__pycache__/dist_funs.cpython-310.pyc +0 -0
- model/__pycache__/help_funcs.cpython-310.pyc +0 -0
- model/__pycache__/opt_flash_attention.cpython-310.pyc +0 -0
- model/__pycache__/opt_flash_attention.cpython-311.pyc +0 -0
- model/__pycache__/opt_flash_attention.cpython-313.pyc +0 -0
- model/test.py +34 -0
- plm_model/esm2-150m/.gitattributes +33 -0
- plm_model/esm2-150m/README.md +20 -0
- plm_model/esm2-150m/config.json +30 -0
- plm_model/esm2-150m/model.safetensors +3 -0
- plm_model/esm2-150m/pytorch_model.bin +3 -0
- plm_model/esm2-150m/special_tokens_map.json +7 -0
- plm_model/esm2-150m/tf_model.h5 +3 -0
- plm_model/esm2-150m/tokenizer_config.json +4 -0
- plm_model/esm2-150m/vocab.txt +33 -0
- plm_model/microsoft/.gitattributes +9 -0
- plm_model/microsoft/LICENSE.md +21 -0
- plm_model/microsoft/README.md +38 -0
- plm_model/microsoft/config.json +17 -0
- plm_model/microsoft/flax_model.msgpack +3 -0
- plm_model/microsoft/pytorch_model.bin +3 -0
- plm_model/microsoft/tokenizer_config.json +3 -0
- plm_model/microsoft/vocab.txt +0 -0
- results/2datasets_construct_predictions1.txt +0 -0
- results/2datasets_qweninstruct_predictions.txt +0 -0
- results/aav_07252307_predictions.txt +0 -0
- results/ablation_deeplocbinary_predictions.txt +0 -0
- results/ablation_fluorescence_predictions.txt +0 -0
- results/ablation_gb1.0_predictions.txt +0 -0
- results/ablation_gb1_predictions.txt +0 -0
- results/ablation_material_predictions.txt +0 -0
- results/ablation_metallonbinding_predictions.txt +0 -0
- results/antibiotic_07262045_predictions.txt +0 -0
.gitattributes
CHANGED
|
@@ -54,3 +54,4 @@ data_small/OntoProteinDatasetV2/train.txt filter=lfs diff=lfs merge=lfs -text
|
|
| 54 |
data_small/PDBDataset/abstract.json filter=lfs diff=lfs merge=lfs -text
|
| 55 |
data_small/PDBDataset/qa_all.json filter=lfs diff=lfs merge=lfs -text
|
| 56 |
data_small/SwissProtV3/train_set_.jsonl filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
| 54 |
data_small/PDBDataset/abstract.json filter=lfs diff=lfs merge=lfs -text
|
| 55 |
data_small/PDBDataset/qa_all.json filter=lfs diff=lfs merge=lfs -text
|
| 56 |
data_small/SwissProtV3/train_set_.jsonl filter=lfs diff=lfs merge=lfs -text
|
| 57 |
+
data/protein-text/eval_assist.zipltqrmb6_.tmp filter=lfs diff=lfs merge=lfs -text
|
data/protein-text/eval_assist.zipltqrmb6_.tmp
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:6092fb4729236af66e921aff8d7330012bc0eb1428240856251cd14ed23b45e9
|
| 3 |
+
size 8291512692
|
llm_model/model-00003-of-00004.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:c5e22df05cdc66407dd05e130ce0e47ddf56e2d3ee6ef6eef9249b07ecb4e9d7
|
| 3 |
+
size 4333083488
|
model/__pycache__/blip2.cpython-310.pyc
ADDED
|
Binary file (3.17 kB). View file
|
|
|
model/__pycache__/blip2.cpython-311.pyc
ADDED
|
Binary file (5.11 kB). View file
|
|
|
model/__pycache__/blip2.cpython-38.pyc
ADDED
|
Binary file (3.11 kB). View file
|
|
|
model/__pycache__/blip2_opt.cpython-310.pyc
ADDED
|
Binary file (9.74 kB). View file
|
|
|
model/__pycache__/blip2_opt.cpython-311.pyc
ADDED
|
Binary file (18.8 kB). View file
|
|
|
model/__pycache__/blip2_stage1.cpython-310.pyc
ADDED
|
Binary file (11.9 kB). View file
|
|
|
model/__pycache__/blip2_stage1.cpython-311.pyc
ADDED
|
Binary file (33.7 kB). View file
|
|
|
model/__pycache__/blip2_stage1.cpython-38.pyc
ADDED
|
Binary file (13.7 kB). View file
|
|
|
model/__pycache__/blip2_stage2.cpython-310.pyc
ADDED
|
Binary file (14.1 kB). View file
|
|
|
model/__pycache__/blip2_stage3.cpython-310.pyc
ADDED
|
Binary file (10.9 kB). View file
|
|
|
model/__pycache__/blip2_stage3.cpython-311.pyc
ADDED
|
Binary file (21.1 kB). View file
|
|
|
model/__pycache__/blip2qformer.cpython-310.pyc
ADDED
|
Binary file (7.54 kB). View file
|
|
|
model/__pycache__/blip2qformer.cpython-311.pyc
ADDED
|
Binary file (15.8 kB). View file
|
|
|
model/__pycache__/blip2qformer.cpython-38.pyc
ADDED
|
Binary file (7.46 kB). View file
|
|
|
model/__pycache__/dist_funs.cpython-310.pyc
ADDED
|
Binary file (6.71 kB). View file
|
|
|
model/__pycache__/help_funcs.cpython-310.pyc
ADDED
|
Binary file (3.96 kB). View file
|
|
|
model/__pycache__/opt_flash_attention.cpython-310.pyc
ADDED
|
Binary file (7.2 kB). View file
|
|
|
model/__pycache__/opt_flash_attention.cpython-311.pyc
ADDED
|
Binary file (15.8 kB). View file
|
|
|
model/__pycache__/opt_flash_attention.cpython-313.pyc
ADDED
|
Binary file (15.2 kB). View file
|
|
|
model/test.py
ADDED
|
@@ -0,0 +1,34 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

# Smoke test: load a fine-tuned Qwen2.5-7B-Instruct checkpoint and run
# generation from `inputs_embeds` (mirrors how the BLIP-2-style pipeline
# injects soft prompts instead of token ids).

# Hoisted so tokenizer and model are guaranteed to load the same checkpoint.
CHECKPOINT = "/oss/wangyujia/BIO/pretrain_output/qwen2.5-7b-instruct-bio/bio_all/save1epoch/checkpoint-1300"

llm_tokenizer = AutoTokenizer.from_pretrained(CHECKPOINT, use_fast=False, padding_side='right')
llm_tokenizer.add_special_tokens({'pad_token': '<pad>'})

llm_model = AutoModelForCausalLM.from_pretrained(CHECKPOINT, torch_dtype=torch.bfloat16)
# Embedding table must grow to cover the newly added <pad> token.
llm_model.resize_token_embeddings(len(llm_tokenizer))

text = "You need to answer the following question directly, which means you can only give the number of the option in the answer. For example: <ANSWER>option number</ANSWER> Based on the following protein\'s amino acid sequence, is the protein located on the membrane? Swiss-Prot description for P86987,Options:\n0.Yes\n1.No"

# Step 1: encode into input_ids and attention_mask (tensors required).
inputs = llm_tokenizer(text, return_tensors="pt")

input_ids = inputs["input_ids"]  # shape: [1, L]
attention_mask = inputs["attention_mask"]  # shape: [1, L]

# Step 2: obtain inputs_embeds from the model's embedding layer.
with torch.no_grad():
    inputs_embeds = llm_model.get_input_embeddings()(input_ids)


outputs = llm_model.generate(
    inputs_embeds=inputs_embeds,
    attention_mask=attention_mask,

    # NOTE(review): with inputs_embeds (no input_ids), max_length counts only
    # newly generated tokens in recent transformers versions — confirm intent.
    max_length=128,
    min_length=1,

    use_cache=True,
    # NOTE(review): "hybrid" cache_implementation is specific to certain model
    # families (e.g. Gemma-2); verify Qwen2.5 supports it or drop the argument.
    cache_implementation="hybrid"
)
# BUG FIX: original read `self.llm_tokenizer` at module level, which raises
# NameError (there is no `self` in a flat script).
output_text = llm_tokenizer.batch_decode(outputs, skip_special_tokens=True)
# Renamed loop variable to avoid shadowing the prompt string `text`.
output_text = [t.strip() for t in output_text]
print(output_text)
|
plm_model/esm2-150m/.gitattributes
ADDED
|
@@ -0,0 +1,33 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
*.7z filter=lfs diff=lfs merge=lfs -text
|
| 2 |
+
*.arrow filter=lfs diff=lfs merge=lfs -text
|
| 3 |
+
*.bin filter=lfs diff=lfs merge=lfs -text
|
| 4 |
+
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
| 5 |
+
*.ftz filter=lfs diff=lfs merge=lfs -text
|
| 6 |
+
*.gz filter=lfs diff=lfs merge=lfs -text
|
| 7 |
+
*.h5 filter=lfs diff=lfs merge=lfs -text
|
| 8 |
+
*.joblib filter=lfs diff=lfs merge=lfs -text
|
| 9 |
+
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
| 10 |
+
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
| 11 |
+
*.model filter=lfs diff=lfs merge=lfs -text
|
| 12 |
+
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
| 13 |
+
*.npy filter=lfs diff=lfs merge=lfs -text
|
| 14 |
+
*.npz filter=lfs diff=lfs merge=lfs -text
|
| 15 |
+
*.onnx filter=lfs diff=lfs merge=lfs -text
|
| 16 |
+
*.ot filter=lfs diff=lfs merge=lfs -text
|
| 17 |
+
*.parquet filter=lfs diff=lfs merge=lfs -text
|
| 18 |
+
*.pb filter=lfs diff=lfs merge=lfs -text
|
| 19 |
+
*.pickle filter=lfs diff=lfs merge=lfs -text
|
| 20 |
+
*.pkl filter=lfs diff=lfs merge=lfs -text
|
| 21 |
+
*.pt filter=lfs diff=lfs merge=lfs -text
|
| 22 |
+
*.pth filter=lfs diff=lfs merge=lfs -text
|
| 23 |
+
*.rar filter=lfs diff=lfs merge=lfs -text
|
| 24 |
+
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
| 25 |
+
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
| 26 |
+
*.tflite filter=lfs diff=lfs merge=lfs -text
|
| 27 |
+
*.tgz filter=lfs diff=lfs merge=lfs -text
|
| 28 |
+
*.wasm filter=lfs diff=lfs merge=lfs -text
|
| 29 |
+
*.xz filter=lfs diff=lfs merge=lfs -text
|
| 30 |
+
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 31 |
+
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 32 |
+
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 33 |
+
model.safetensors filter=lfs diff=lfs merge=lfs -text
|
plm_model/esm2-150m/README.md
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
license: mit
|
| 3 |
+
widget:
|
| 4 |
+
- text: "MQIFVKTLTGKTITLEVEPS<mask>TIENVKAKIQDKEGIPPDQQRLIFAGKQLEDGRTLSDYNIQKESTLHLVLRLRGG"
|
| 5 |
+
---
|
| 6 |
+
|
| 7 |
+
## ESM-2
|
| 8 |
+
|
| 9 |
+
ESM-2 is a state-of-the-art protein model trained on a masked language modelling objective. It is suitable for fine-tuning on a wide range of tasks that take protein sequences as input. For detailed information on the model architecture and training data, please refer to the [accompanying paper](https://www.biorxiv.org/content/10.1101/2022.07.20.500902v2). You may also be interested in some demo notebooks ([PyTorch](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/protein_language_modeling.ipynb), [TensorFlow](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/protein_language_modeling-tf.ipynb)) which demonstrate how to fine-tune ESM-2 models on your tasks of interest.
|
| 10 |
+
|
| 11 |
+
Several ESM-2 checkpoints are available in the Hub with varying sizes. Larger sizes generally have somewhat better accuracy, but require much more memory and time to train:
|
| 12 |
+
|
| 13 |
+
| Checkpoint name | Num layers | Num parameters |
|
| 14 |
+
|------------------------------|----|----------|
|
| 15 |
+
| [esm2_t48_15B_UR50D](https://huggingface.co/facebook/esm2_t48_15B_UR50D) | 48 | 15B |
|
| 16 |
+
| [esm2_t36_3B_UR50D](https://huggingface.co/facebook/esm2_t36_3B_UR50D) | 36 | 3B |
|
| 17 |
+
| [esm2_t33_650M_UR50D](https://huggingface.co/facebook/esm2_t33_650M_UR50D) | 33 | 650M |
|
| 18 |
+
| [esm2_t30_150M_UR50D](https://huggingface.co/facebook/esm2_t30_150M_UR50D) | 30 | 150M |
|
| 19 |
+
| [esm2_t12_35M_UR50D](https://huggingface.co/facebook/esm2_t12_35M_UR50D) | 12 | 35M |
|
| 20 |
+
| [esm2_t6_8M_UR50D](https://huggingface.co/facebook/esm2_t6_8M_UR50D) | 6 | 8M |
|
plm_model/esm2-150m/config.json
ADDED
|
@@ -0,0 +1,30 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"_name_or_path": "/tmp/facebook/esm2_t30_150M_UR50D",
|
| 3 |
+
"architectures": [
|
| 4 |
+
"EsmForMaskedLM"
|
| 5 |
+
],
|
| 6 |
+
"attention_probs_dropout_prob": 0.0,
|
| 7 |
+
"classifier_dropout": null,
|
| 8 |
+
"emb_layer_norm_before": false,
|
| 9 |
+
"esmfold_config": null,
|
| 10 |
+
"hidden_act": "gelu",
|
| 11 |
+
"hidden_dropout_prob": 0.0,
|
| 12 |
+
"hidden_size": 640,
|
| 13 |
+
"initializer_range": 0.02,
|
| 14 |
+
"intermediate_size": 2560,
|
| 15 |
+
"is_folding_model": false,
|
| 16 |
+
"layer_norm_eps": 1e-05,
|
| 17 |
+
"mask_token_id": 32,
|
| 18 |
+
"max_position_embeddings": 1026,
|
| 19 |
+
"model_type": "esm",
|
| 20 |
+
"num_attention_heads": 20,
|
| 21 |
+
"num_hidden_layers": 30,
|
| 22 |
+
"pad_token_id": 1,
|
| 23 |
+
"position_embedding_type": "rotary",
|
| 24 |
+
"token_dropout": true,
|
| 25 |
+
"torch_dtype": "float32",
|
| 26 |
+
"transformers_version": "4.25.0.dev0",
|
| 27 |
+
"use_cache": true,
|
| 28 |
+
"vocab_list": null,
|
| 29 |
+
"vocab_size": 33
|
| 30 |
+
}
|
plm_model/esm2-150m/model.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:c3f1da8aea53bddd32c246c86168c23b9fd72341fb9db9a94436f855f5053566
|
| 3 |
+
size 595257706
|
plm_model/esm2-150m/pytorch_model.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:a88feb574b9f4e31c45762961d1b2ddba95796db86fb480d207b4d15e6ec8aab
|
| 3 |
+
size 595364077
|
plm_model/esm2-150m/special_tokens_map.json
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"cls_token": "<cls>",
|
| 3 |
+
"eos_token": "<eos>",
|
| 4 |
+
"mask_token": "<mask>",
|
| 5 |
+
"pad_token": "<pad>",
|
| 6 |
+
"unk_token": "<unk>"
|
| 7 |
+
}
|
plm_model/esm2-150m/tf_model.h5
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:78b04a25f0ded4bd3a7665d4589863984117384b239cee6de97c46aefec4a1b5
|
| 3 |
+
size 593355136
|
plm_model/esm2-150m/tokenizer_config.json
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"model_max_length": 1000000000000000019884624838656,
|
| 3 |
+
"tokenizer_class": "EsmTokenizer"
|
| 4 |
+
}
|
plm_model/esm2-150m/vocab.txt
ADDED
|
@@ -0,0 +1,33 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
<cls>
|
| 2 |
+
<pad>
|
| 3 |
+
<eos>
|
| 4 |
+
<unk>
|
| 5 |
+
L
|
| 6 |
+
A
|
| 7 |
+
G
|
| 8 |
+
V
|
| 9 |
+
S
|
| 10 |
+
E
|
| 11 |
+
R
|
| 12 |
+
T
|
| 13 |
+
I
|
| 14 |
+
D
|
| 15 |
+
P
|
| 16 |
+
K
|
| 17 |
+
Q
|
| 18 |
+
N
|
| 19 |
+
F
|
| 20 |
+
Y
|
| 21 |
+
M
|
| 22 |
+
H
|
| 23 |
+
W
|
| 24 |
+
C
|
| 25 |
+
X
|
| 26 |
+
B
|
| 27 |
+
U
|
| 28 |
+
Z
|
| 29 |
+
O
|
| 30 |
+
.
|
| 31 |
+
-
|
| 32 |
+
<null_1>
|
| 33 |
+
<mask>
|
plm_model/microsoft/.gitattributes
ADDED
|
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
*.bin.* filter=lfs diff=lfs merge=lfs -text
|
| 2 |
+
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
| 3 |
+
*.bin filter=lfs diff=lfs merge=lfs -text
|
| 4 |
+
*.h5 filter=lfs diff=lfs merge=lfs -text
|
| 5 |
+
*.tflite filter=lfs diff=lfs merge=lfs -text
|
| 6 |
+
*.tar.gz filter=lfs diff=lfs merge=lfs -text
|
| 7 |
+
*.ot filter=lfs diff=lfs merge=lfs -text
|
| 8 |
+
*.onnx filter=lfs diff=lfs merge=lfs -text
|
| 9 |
+
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
plm_model/microsoft/LICENSE.md
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
MIT License
|
| 2 |
+
|
| 3 |
+
Copyright (c) Microsoft Corporation
|
| 4 |
+
|
| 5 |
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
| 6 |
+
of this software and associated documentation files (the "Software"), to deal
|
| 7 |
+
in the Software without restriction, including without limitation the rights
|
| 8 |
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
| 9 |
+
copies of the Software, and to permit persons to whom the Software is
|
| 10 |
+
furnished to do so, subject to the following conditions:
|
| 11 |
+
|
| 12 |
+
The above copyright notice and this permission notice shall be included in all
|
| 13 |
+
copies or substantial portions of the Software.
|
| 14 |
+
|
| 15 |
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
| 16 |
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
| 17 |
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
|
| 18 |
+
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
|
| 19 |
+
DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
|
| 20 |
+
OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE
|
| 21 |
+
OR OTHER DEALINGS IN THE SOFTWARE.
|
plm_model/microsoft/README.md
ADDED
|
@@ -0,0 +1,38 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
language: en
|
| 3 |
+
tags:
|
| 4 |
+
- exbert
|
| 5 |
+
license: mit
|
| 6 |
+
widget:
|
| 7 |
+
- text: "[MASK] is a tumor suppressor gene."
|
| 8 |
+
---
|
| 9 |
+
|
| 10 |
+
## MSR BiomedBERT (abstracts + full text)
|
| 11 |
+
|
| 12 |
+
<div style="border: 2px solid orange; border-radius:10px; padding:0px 10px; width: fit-content;">
|
| 13 |
+
|
| 14 |
+
* This model was previously named **"PubMedBERT (abstracts + full text)"**.
|
| 15 |
+
* You can either adopt the new model name "microsoft/BiomedNLP-BiomedBERT-base-uncased-abstract-fulltext" or update your `transformers` library to version 4.22+ if you need to refer to the old name.
|
| 16 |
+
|
| 17 |
+
</div>
|
| 18 |
+
|
| 19 |
+
Pretraining large neural language models, such as BERT, has led to impressive gains on many natural language processing (NLP) tasks. However, most pretraining efforts focus on general domain corpora, such as newswire and Web. A prevailing assumption is that even domain-specific pretraining can benefit by starting from general-domain language models. [Recent work](https://arxiv.org/abs/2007.15779) shows that for domains with abundant unlabeled text, such as biomedicine, pretraining language models from scratch results in substantial gains over continual pretraining of general-domain language models.
|
| 20 |
+
|
| 21 |
+
BiomedBERT is pretrained from scratch using _abstracts_ from [PubMed](https://pubmed.ncbi.nlm.nih.gov/) and _full-text_ articles from [PubMedCentral](https://www.ncbi.nlm.nih.gov/pmc/). This model achieves state-of-the-art performance on many biomedical NLP tasks, and currently holds the top score on the [Biomedical Language Understanding and Reasoning Benchmark](https://aka.ms/BLURB).
|
| 22 |
+
|
| 23 |
+
## Citation
|
| 24 |
+
|
| 25 |
+
If you find BiomedBERT useful in your research, please cite the following paper:
|
| 26 |
+
|
| 27 |
+
```latex
|
| 28 |
+
@misc{pubmedbert,
|
| 29 |
+
author = {Yu Gu and Robert Tinn and Hao Cheng and Michael Lucas and Naoto Usuyama and Xiaodong Liu and Tristan Naumann and Jianfeng Gao and Hoifung Poon},
|
| 30 |
+
title = {Domain-Specific Language Model Pretraining for Biomedical Natural Language Processing},
|
| 31 |
+
year = {2020},
|
| 32 |
+
eprint = {arXiv:2007.15779},
|
| 33 |
+
}
|
| 34 |
+
```
|
| 35 |
+
|
| 36 |
+
<a href="https://huggingface.co/exbert/?model=microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext&modelKind=bidirectional&sentence=Gefitinib%20is%20an%20EGFR%20tyrosine%20kinase%20inhibitor,%20which%20is%20often%20used%20for%20breast%20cancer%20and%20NSCLC%20treatment.&layer=3&heads=..0,1,2,3,4,5,6,7,8,9,10,11&threshold=0.7&tokenInd=17&tokenSide=right&maskInds=..&hideClsSep=true">
|
| 37 |
+
<img width="300px" src="https://cdn-media.huggingface.co/exbert/button.png">
|
| 38 |
+
</a>
|
plm_model/microsoft/config.json
ADDED
|
@@ -0,0 +1,17 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"architectures": [
|
| 3 |
+
"BertForMaskedLM"
|
| 4 |
+
],
|
| 5 |
+
"model_type": "bert",
|
| 6 |
+
"attention_probs_dropout_prob": 0.1,
|
| 7 |
+
"hidden_act": "gelu",
|
| 8 |
+
"hidden_dropout_prob": 0.1,
|
| 9 |
+
"hidden_size": 768,
|
| 10 |
+
"initializer_range": 0.02,
|
| 11 |
+
"intermediate_size": 3072,
|
| 12 |
+
"max_position_embeddings": 512,
|
| 13 |
+
"num_attention_heads": 12,
|
| 14 |
+
"num_hidden_layers": 12,
|
| 15 |
+
"type_vocab_size": 2,
|
| 16 |
+
"vocab_size": 30522
|
| 17 |
+
}
|
plm_model/microsoft/flax_model.msgpack
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:84761403b655e7d865093297cc57d574c5ec7ce705917f9d7011683c79f5fc41
|
| 3 |
+
size 437936109
|
plm_model/microsoft/pytorch_model.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:ad7bbb66376cfd6b2db3447192b034efe016337cbef135c35c411fd61b13c193
|
| 3 |
+
size 440474434
|
plm_model/microsoft/tokenizer_config.json
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"do_lower_case": true
|
| 3 |
+
}
|
plm_model/microsoft/vocab.txt
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
results/2datasets_construct_predictions1.txt
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
results/2datasets_qweninstruct_predictions.txt
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
results/aav_07252307_predictions.txt
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
results/ablation_deeplocbinary_predictions.txt
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
results/ablation_fluorescence_predictions.txt
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
results/ablation_gb1.0_predictions.txt
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
results/ablation_gb1_predictions.txt
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
results/ablation_material_predictions.txt
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
results/ablation_metallonbinding_predictions.txt
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
results/antibiotic_07262045_predictions.txt
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|