Initial model upload
Browse files- README.md +56 -0
- config.yaml +19 -0
- inference.py +40 -0
- model.pt +3 -0
- network.py +59 -0
- tokenizer_mapping.json +27 -0
README.md
ADDED
|
@@ -0,0 +1,56 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
license: gpl-3.0
|
| 3 |
+
tags:
|
| 4 |
+
- protein
|
| 5 |
+
- peptide
|
| 6 |
+
- deep-learning
|
| 7 |
+
- pytorch
|
| 8 |
+
- bioinformatics
|
| 9 |
+
library_name: pytorch
|
| 10 |
+
---
|
| 11 |
+
|
| 12 |
+
# Revised Peptide LGBM Model
|
| 13 |
+
|
| 14 |
+
This repository contains a PyTorch deep learning model trained to predict peptide properties from amino acid sequences.
|
| 15 |
+
|
| 16 |
+
## Model Description
|
| 17 |
+
|
| 18 |
+
The model uses tokenized amino acid sequences as input and predicts a probability score indicating the likelihood of the peptide belonging to the positive class.
|
| 19 |
+
|
| 20 |
+
The architecture is defined in `model/network.py` and initialized using a YAML configuration file.
|
| 21 |
+
|
| 22 |
+
## Input Representation
|
| 23 |
+
|
| 24 |
+
Sequences are tokenized using the following mapping:
|
| 25 |
+
|
| 26 |
+
| Token | Description |
|
| 27 |
+
|------|-------------|
|
| 28 |
+
| PAD | Padding |
|
| 29 |
+
| UNK | Unknown |
|
| 30 |
+
| CLS | Start token |
|
| 31 |
+
| SEP | Separator |
|
| 32 |
+
| MASK | Mask token |
|
| 33 |
+
| L,A,G,V,E,S,I,K,R,D,T,P,N,Q,F,Y,M,H,C,W | Amino acids |
|
| 34 |
+
|
| 35 |
+
Sequences are padded to the maximum length within a batch.
|
| 36 |
+
|
| 37 |
+
## Files
|
| 38 |
+
|
| 39 |
+
| File | Description |
|
| 40 |
+
|----|----|
|
| 41 |
+
| model.pt | Trained model checkpoint |
|
| 42 |
+
| config.yaml | Model configuration |
|
| 43 |
+
| tokenizer_mapping.json | Amino acid token mapping |
|
| 44 |
+
| inference.py | Example inference script |
| network.py | Model architecture definition |
|
| 45 |
+
|
| 46 |
+
## Usage
|
| 47 |
+
|
| 48 |
+
Example inference:
|
| 49 |
+
|
| 50 |
+
```python
|
| 51 |
+
from inference import predict
|
| 52 |
+
|
| 53 |
+
sequence = "LAGVEST"
|
| 54 |
+
probability = predict(sequence)
|
| 55 |
+
|
| 56 |
+
print(probability)
|
config.yaml
ADDED
|
@@ -0,0 +1,19 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
epochs: 50
|
| 2 |
+
batch_size: 32
|
| 3 |
+
vocab_size: 25
|
| 4 |
+
task: revised_peptide_LGBM_5 # hemo, sol, nf
|
| 5 |
+
debug: false
|
| 6 |
+
|
| 7 |
+
network:
|
| 8 |
+
hidden_size: 480
|
| 9 |
+
hidden_layers: 12
|
| 10 |
+
attn_heads: 12
|
| 11 |
+
dropout: 0.15
|
| 12 |
+
|
| 13 |
+
optim:
|
| 14 |
+
lr: 1.0e-5
|
| 15 |
+
|
| 16 |
+
sch:
|
| 17 |
+
name: lronplateau # onecycle, lronplateau
|
| 18 |
+
factor: 0.1
|
| 19 |
+
patience: 4
|
inference.py
ADDED
|
@@ -0,0 +1,40 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import torch
import yaml
import json
from model.network import create_model
from huggingface_hub import hf_hub_download

repo_id = "YOUR_USERNAME/revised_peptide_LGBM_5"

# Download model artifacts from the Hugging Face Hub (cached locally after
# the first call).
model_path = hf_hub_download(repo_id, "model.pt")
config_path = hf_hub_download(repo_id, "config.yaml")
mapping_path = hf_hub_download(repo_id, "tokenizer_mapping.json")

# Use context managers so file handles are closed deterministically
# (the previous open(...) calls leaked the handles).
with open(config_path) as f:
    config = yaml.safe_load(f)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
config["device"] = device

model = create_model(config)
# map_location lets a checkpoint saved on GPU load on a CPU-only machine;
# strict=False tolerates keys in the checkpoint that the network does not
# define (kept from the original — presumably intentional; confirm).
checkpoint = torch.load(model_path, map_location=device)
model.load_state_dict(checkpoint["model_state_dict"], strict=False)

model.to(device)
model.eval()

with open(mapping_path) as f:
    mapping = json.load(f)


def predict(sequence):
    """Return the model's positive-class probability for an amino-acid sequence.

    Characters absent from the tokenizer mapping fall back to the [UNK] token.

    Args:
        sequence: peptide string, e.g. "LAGVEST".

    Returns:
        float probability in [0, 1].
    """
    tokens = [mapping.get(c, mapping["[UNK]"]) for c in sequence]
    input_ids = torch.tensor([tokens]).to(device)

    # Attend to every non-PAD position (PAD token id is 0).
    attention_mask = (input_ids != 0).float()

    with torch.no_grad():
        prob = model(input_ids, attention_mask)[0].item()

    return prob


if __name__ == "__main__":
    print(predict("LAGVEST"))
|
model.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:b3fb16c1864d4c944a8133acd9fb358f699126ed4252174e79e0b9d653a2fc95
|
| 3 |
+
size 564544429
|
network.py
ADDED
|
@@ -0,0 +1,59 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import torch
|
| 2 |
+
from transformers import BertModel, BertConfig, logging
|
| 3 |
+
|
| 4 |
+
logging.set_verbosity_error()
|
| 5 |
+
|
| 6 |
+
|
| 7 |
+
class PeptideBERT(torch.nn.Module):
    """ProtBERT encoder topped with a single-unit sigmoid classification head.

    The forward pass returns a probability-like score per input sequence,
    taken from the pooled ([CLS]) representation.
    """

    def __init__(self, bert_config):
        super().__init__()

        # Pretrained ProtBERT weights; layers whose shapes differ from
        # bert_config are re-initialized rather than raising.
        self.protbert = BertModel.from_pretrained(
            'Rostlab/prot_bert_bfd',
            config=bert_config,
            ignore_mismatched_sizes=True,
        )
        # One linear logit squashed to (0, 1).
        self.head = torch.nn.Sequential(
            torch.nn.Linear(bert_config.hidden_size, 1),
            torch.nn.Sigmoid(),
        )

    def forward(self, inputs, attention_mask):
        """Encode token ids and classify via the pooled output.

        Args:
            inputs: token-id tensor, shape (batch, seq_len).
            attention_mask: float/bool mask, 1 for real tokens, 0 for padding.

        Returns:
            Tensor of shape (batch, 1) with values in (0, 1).
        """
        encoded = self.protbert(inputs, attention_mask=attention_mask)
        return self.head(encoded.pooler_output)
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
def create_model(config):
    """Build a PeptideBERT from a parsed config dict and move it to its device.

    Expects config keys: 'vocab_size', 'device', and a 'network' section
    with 'hidden_size', 'hidden_layers', 'attn_heads', 'dropout'.
    """
    net_cfg = config['network']
    bert_config = BertConfig(
        vocab_size=config['vocab_size'],
        hidden_size=net_cfg['hidden_size'],
        num_hidden_layers=net_cfg['hidden_layers'],
        num_attention_heads=net_cfg['attn_heads'],
        hidden_dropout_prob=net_cfg['dropout'],
    )
    return PeptideBERT(bert_config).to(config['device'])
|
| 38 |
+
|
| 39 |
+
|
| 40 |
+
def cri_opt_sch(config, model):
    """Create the loss criterion, optimizer, and LR scheduler from config.

    Args:
        config: dict with 'optim.lr', 'sch.name' ('onecycle' or 'lronplateau')
            plus the scheduler-specific keys ('epochs'/'sch.steps' for
            onecycle; 'sch.factor'/'sch.patience' for lronplateau).
        model: the torch.nn.Module whose parameters are optimized.

    Returns:
        (criterion, optimizer, scheduler) tuple.

    Raises:
        ValueError: if config['sch']['name'] is not a recognized scheduler.
    """
    criterion = torch.nn.BCELoss()
    optimizer = torch.optim.AdamW(model.parameters(), lr=config['optim']['lr'])

    sch_name = config['sch']['name']
    if sch_name == 'onecycle':
        scheduler = torch.optim.lr_scheduler.OneCycleLR(
            optimizer,
            max_lr=config['optim']['lr'],
            epochs=config['epochs'],
            steps_per_epoch=config['sch']['steps']
        )
    elif sch_name == 'lronplateau':
        # mode='max': the monitored metric is maximized — presumably a
        # validation score; confirm against the training loop.
        scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
            optimizer,
            mode='max',
            factor=config['sch']['factor'],
            patience=config['sch']['patience']
        )
    else:
        # Previously an unknown name fell through and the return raised
        # NameError on the unbound 'scheduler'; fail fast with a clear error.
        raise ValueError(f"Unknown scheduler name: {sch_name!r}")

    return criterion, optimizer, scheduler
|
tokenizer_mapping.json
ADDED
|
@@ -0,0 +1,27 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"[PAD]": 0,
|
| 3 |
+
"[UNK]": 1,
|
| 4 |
+
"[CLS]": 2,
|
| 5 |
+
"[SEP]": 3,
|
| 6 |
+
"[MASK]": 4,
|
| 7 |
+
"L": 5,
|
| 8 |
+
"A": 6,
|
| 9 |
+
"G": 7,
|
| 10 |
+
"V": 8,
|
| 11 |
+
"E": 9,
|
| 12 |
+
"S": 10,
|
| 13 |
+
"I": 11,
|
| 14 |
+
"K": 12,
|
| 15 |
+
"R": 13,
|
| 16 |
+
"D": 14,
|
| 17 |
+
"T": 15,
|
| 18 |
+
"P": 16,
|
| 19 |
+
"N": 17,
|
| 20 |
+
"Q": 18,
|
| 21 |
+
"F": 19,
|
| 22 |
+
"Y": 20,
|
| 23 |
+
"M": 21,
|
| 24 |
+
"H": 22,
|
| 25 |
+
"C": 23,
|
| 26 |
+
"W": 24
|
| 27 |
+
}
|