Brilleslangen commited on
Commit ·
aec9df8
1
Parent(s): b385c11
App for cleavage site prediction.
Browse files- app.py +39 -0
- example_inputs.csv +21 -0
- model.pt +3 -0
- model.py +65 -0
- requirements.txt +6 -0
- tokenizer/special_tokens_map.json +7 -0
- tokenizer/tokenizer_config.json +53 -0
- tokenizer/vocab.txt +33 -0
app.py
ADDED
|
@@ -0,0 +1,39 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import gradio as gr
import torch
import pandas as pd
from transformers import EsmTokenizer
from model import CleavageSiteModel

# Load the ESM tokenizer shipped alongside the app (local "tokenizer" folder).
tokenizer = EsmTokenizer.from_pretrained("tokenizer")

# Build the classifier on top of ESM-2 (150M) and restore the trained weights.
model = CleavageSiteModel(num_classes=75, base_model="facebook/esm2_t30_150M_UR50D")
# weights_only=True restricts torch.load to plain tensors/containers, avoiding
# arbitrary-code execution from a tampered checkpoint (and the torch>=2.4
# FutureWarning about the old default).
model.load_state_dict(torch.load("model.pt", map_location="cpu", weights_only=True))
model.eval()  # inference mode: disables dropout etc.

# Example (sequence, cleavage_site) pairs shown in the Gradio examples table.
examples_df = pd.read_csv("example_inputs.csv")
examples = examples_df[["sequence", "cleavage_site"]].values.tolist()
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
# Inference function accepting both sequence and true label
|
| 19 |
+
def predict(sequence, true_site):
    """Predict the cleavage-site index for a single protein sequence.

    Args:
        sequence: Amino-acid sequence (one-letter codes). Whitespace is
            stripped and the text upper-cased before tokenization so pasted
            or lower-case input still tokenizes sensibly.
        true_site: User-provided known cleavage-site index; it is only echoed
            back in the output string, never fed to the model.

    Returns:
        Human-readable string with the predicted index and the given label,
        or a prompt to enter a sequence when the input is empty.
    """
    sequence = (sequence or "").strip().upper()
    if not sequence:
        # Guard: an empty string would otherwise be tokenized to specials only.
        return "Please enter a protein sequence."

    inputs = tokenizer(sequence, return_tensors="pt", truncation=True, padding=True)
    with torch.no_grad():  # inference only: skip gradient bookkeeping
        outputs = model(**inputs)
    logits = outputs["logits"]
    prediction = logits.argmax(dim=1).item()
    return f"Predicted cleavage site index: {prediction} (True: {true_site})"
|
| 26 |
+
|
| 27 |
+
|
| 28 |
+
# Launch Gradio interface
|
| 29 |
+
# Assemble the web UI, then start it.
demo = gr.Interface(
    fn=predict,
    inputs=[
        gr.Textbox(label="Protein Sequence", lines=2),
        gr.Number(label="True Cleavage Site"),
    ],
    outputs=gr.Textbox(label="Model Output"),
    examples=examples,
    title="Signal Peptide Cleavage Site Predictor",
    description=(
        "Select an example or enter your own protein sequence and "
        "(optionally) its known cleavage site index."
    ),
)

demo.launch()
|
example_inputs.csv
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
sequence,cleavage_site
|
| 2 |
+
MKAVITLLFLACILVVTYGDLICGTNYCKDHPCTSPIARASCRSPATYRANHSGKCACCPACVTLLRERA,18
|
| 3 |
+
MKIILTLSIFLICFLQLGQSVIDPSQNEVMSDLLFNLYGYDKSLDPCNSNSVECDDINSTSTIKTVISLN,19
|
| 4 |
+
MKHLLTLALCFSSINAVAVTVPHKAVGTGIPEGSLQFLSLRASAPIGSAISRNNWAVTCDSAQSGNECNK,23
|
| 5 |
+
MLFKSLSKLATAAAFFAGVATADDVPAIEVVGNKFFYSNNGSQFYIRGVAYQADTANETSGSTVNDPLAN,21
|
| 6 |
+
MVRPKHQPGGLCLLLLLLCQFMEDRSAQAGNCWLRQAKNGRCQVLYKTELSKEECCSTGRLSTSWTEEDV,28
|
| 7 |
+
METVLILCSLLAPVVLASAAEKEKEKDPFYYDYQTLRIGGLVFAVVLFSVGILLILSRRCKCSFNQKPRA,16
|
| 8 |
+
MKIILILSIFLICFLQLGQSVIDPSQNEVMSDLLFNLYGYDKSLDPCNNNYVECEYINTTSTIQTVKSLS,19
|
| 9 |
+
MERPVPSRLVPLPLLLLSSLSLLAARANADISMEACCTDGNQMANQHRDCSLPYTSESKECRMVQEQCCH,28
|
| 10 |
+
MKSFVLLFCLAQLWGCHSIPLDPVAGYKEPACDDPDTEQAALAAVDYINKHLPRGYKHTLNQIDSVKVWP,17
|
| 11 |
+
MLSLRVACLILSLASTVWTADTGTTSEFIEAGGDIRGPRIVERQPSQCKETDWPFCSDEDWNHKCPSGCR,18
|
| 12 |
+
MNSVLFLTLAVCSSLAYGKEFVATVRQNYKENINQLLEQQIQKELAASYIYQAYASYFQRADVSLPGIKK,17
|
| 13 |
+
MKSVQFCFLFCCWRAICCRSCELTNITITVEKEECSFCISINTTWCAGYCYTRDLVYKDPARPNIQKACT,18
|
| 14 |
+
MVRARHQPGGLCLLLLLLCQFMEDRSAQAGNCWLRQAKNGRCQVLYKTELSKEECCSTGRLSTSWTEEDV,28
|
| 15 |
+
MNSLVALVLLGQIIGSTLSSQVRGDLECDEKDAKEWTDTGVRYINEHKLHGYKYALNVIKNIVVVPWDGD,18
|
| 16 |
+
MVKFLLLALALGVSCAHYQNLEVSPSEVDGKWYSLYIAADNKEKVSEGGPLRAYIKNVECIDECQTLKIT,15
|
| 17 |
+
MWLLVSVILISRISSVGGEAMFCDFPKINHGILYDEEKYKPFSQVPTGEVFYYSCEYNFVSPSKSFWTRI,17
|
| 18 |
+
MKPIFLVLLVATSAYAAPSVTINQYSDNEIPRDIDDGKASSVISRAWDYVDDTDKSIAILNVQEILKDMA,15
|
| 19 |
+
MARNMNILTLFAVLIGSASAVYHPPSWTAWIAPKPWTAWKVHPPAWTAWKAHPPAWTAWKATPKPWTAWK,19
|
| 20 |
+
MAEWLLSASWQRRAKAMTAAAGSAGRAAVPLLLCALLAPGGAYVLDDSDGLGREFDGIGAVSGGGATSRL,41
|
| 21 |
+
MQRLCVYVLIFALALAAFSEASWKPRSQQPDAPLGTGANRDLELPWLEQQGPASHHRRQLGPQGPPHLVA,20
|
model.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:b255bb785120a15e0046580dd94f1f982c487031d8ceb8ae53eec4f5e33b30b7
|
| 3 |
+
size 595544586
|
model.py
ADDED
|
@@ -0,0 +1,65 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import torch.nn as nn
import torch.nn.functional as F
import torch
from transformers import EsmModel, AutoModel, PreTrainedModel, AutoConfig
import evaluate
import numpy as np
from sklearn.metrics import accuracy_score, classification_report
import wandb

# HuggingFace `evaluate` metrics, fetched once at import time (may hit the
# network on first use).
# NOTE(review): none of these four objects are referenced anywhere in this
# module -- compute_metrics below uses sklearn instead. Confirm an importer
# relies on them before removing.
accuracy_metric = evaluate.load("accuracy")
precision_metric = evaluate.load("precision")
recall_metric = evaluate.load("recall")
f1_metric = evaluate.load("f1")
| 14 |
+
|
| 15 |
+
|
| 16 |
+
class CleavageSiteModel(nn.Module):
    """ESM-2 backbone with a linear head that classifies the cleavage-site
    position of a protein sequence as one of ``num_classes`` indices.

    Args:
        base_model: HuggingFace model id or local path of the ESM checkpoint.
        num_classes: Number of possible cleavage-site positions (default 75).
        class_weights: Optional mapping ``{class_index: weight}`` used to
            build a weighted CrossEntropyLoss. Classes absent from the
            mapping keep weight 0, i.e. they contribute nothing to the loss.
    """

    def __init__(self, base_model, num_classes=75, class_weights=None):
        super().__init__()
        self.model = EsmModel.from_pretrained(base_model)
        self.classifier = nn.Linear(self.model.config.hidden_size, num_classes)

        if class_weights is not None:
            # Dense weight vector: unspecified classes default to weight 0.
            weight_tensor = torch.zeros(num_classes)
            for class_idx, weight in class_weights.items():
                weight_tensor[class_idx] = weight
            self.loss_fn = nn.CrossEntropyLoss(weight=weight_tensor)
        else:
            self.loss_fn = nn.CrossEntropyLoss()

    def forward(self, input_ids, attention_mask=None, labels=None):
        """Run a forward pass.

        Args:
            input_ids: Token ids, shape (batch, seq_len).
            attention_mask: Optional padding mask, shape (batch, seq_len).
                Defaults to None (every position attended) so callers that
                tokenize without padding may omit it; passing it positionally
                as before still works.
            labels: Optional gold class indices, shape (batch,). When given,
                the returned dict also contains the cross-entropy loss.

        Returns:
            Dict with "logits" of shape (batch, num_classes) and, if labels
            were provided, "loss".
        """
        outputs = self.model(input_ids=input_ids, attention_mask=attention_mask)
        # Sequence-level representation: the <cls> token embedding.
        cls_output = outputs.last_hidden_state[:, 0]
        logits = self.classifier(cls_output)

        if labels is not None:
            loss = self.loss_fn(logits, labels)
            return {"loss": loss, "logits": logits}
        return {"logits": logits}
|
| 41 |
+
|
| 42 |
+
|
| 43 |
+
def compute_metrics(eval_pred):
    """Compute overall and per-class accuracy for a Trainer evaluation step.

    Args:
        eval_pred: Pair ``(logits, labels)``; logits of shape
            (n, num_classes), labels of shape (n,). Plain lists are accepted
            and converted to arrays.

    Returns:
        Dict with "accuracy" plus one ``accuracy_class_<c>`` entry per class
        present in the labels. Also logs the metrics (and an HTML
        classification report) to wandb as a side effect.
    """
    logits, labels = eval_pred
    # np.asarray makes the boolean masking below safe when the caller hands
    # in plain lists (a bare list == scalar comparison yields a scalar False
    # and would break the mask indexing).
    labels = np.asarray(labels)
    predictions = np.argmax(np.asarray(logits), axis=1)

    # Overall accuracy.
    accuracy = accuracy_score(labels, predictions)

    # classification_report is plain text; <br> preserves line breaks in HTML.
    report = classification_report(labels, predictions, digits=4)
    wandb.log({"classification_report": wandb.Html(report.replace('\n', '<br>'))})

    # Per-class accuracy (equivalently, per-class recall).
    per_class_acc = {}
    for cls in np.unique(labels):
        class_mask = labels == cls  # samples belonging to this class
        per_class_acc[f"accuracy_class_{cls}"] = float(
            (predictions[class_mask] == labels[class_mask]).mean()
        )

    wandb.log({"overall_accuracy": accuracy, **per_class_acc})

    return {"accuracy": accuracy, **per_class_acc}
|
requirements.txt
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
torch
|
| 2 |
+
transformers
|
| 3 |
+
gradio
|
| 4 |
+
evaluate
|
| 5 |
+
scikit-learn
|
| 6 |
+
wandb
|
tokenizer/special_tokens_map.json
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"cls_token": "<cls>",
|
| 3 |
+
"eos_token": "<eos>",
|
| 4 |
+
"mask_token": "<mask>",
|
| 5 |
+
"pad_token": "<pad>",
|
| 6 |
+
"unk_token": "<unk>"
|
| 7 |
+
}
|
tokenizer/tokenizer_config.json
ADDED
|
@@ -0,0 +1,53 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"added_tokens_decoder": {
|
| 3 |
+
"0": {
|
| 4 |
+
"content": "<cls>",
|
| 5 |
+
"lstrip": false,
|
| 6 |
+
"normalized": false,
|
| 7 |
+
"rstrip": false,
|
| 8 |
+
"single_word": false,
|
| 9 |
+
"special": true
|
| 10 |
+
},
|
| 11 |
+
"1": {
|
| 12 |
+
"content": "<pad>",
|
| 13 |
+
"lstrip": false,
|
| 14 |
+
"normalized": false,
|
| 15 |
+
"rstrip": false,
|
| 16 |
+
"single_word": false,
|
| 17 |
+
"special": true
|
| 18 |
+
},
|
| 19 |
+
"2": {
|
| 20 |
+
"content": "<eos>",
|
| 21 |
+
"lstrip": false,
|
| 22 |
+
"normalized": false,
|
| 23 |
+
"rstrip": false,
|
| 24 |
+
"single_word": false,
|
| 25 |
+
"special": true
|
| 26 |
+
},
|
| 27 |
+
"3": {
|
| 28 |
+
"content": "<unk>",
|
| 29 |
+
"lstrip": false,
|
| 30 |
+
"normalized": false,
|
| 31 |
+
"rstrip": false,
|
| 32 |
+
"single_word": false,
|
| 33 |
+
"special": true
|
| 34 |
+
},
|
| 35 |
+
"32": {
|
| 36 |
+
"content": "<mask>",
|
| 37 |
+
"lstrip": false,
|
| 38 |
+
"normalized": false,
|
| 39 |
+
"rstrip": false,
|
| 40 |
+
"single_word": false,
|
| 41 |
+
"special": true
|
| 42 |
+
}
|
| 43 |
+
},
|
| 44 |
+
"clean_up_tokenization_spaces": false,
|
| 45 |
+
"cls_token": "<cls>",
|
| 46 |
+
"eos_token": "<eos>",
|
| 47 |
+
"extra_special_tokens": {},
|
| 48 |
+
"mask_token": "<mask>",
|
| 49 |
+
"model_max_length": 1000000000000000019884624838656,
|
| 50 |
+
"pad_token": "<pad>",
|
| 51 |
+
"tokenizer_class": "EsmTokenizer",
|
| 52 |
+
"unk_token": "<unk>"
|
| 53 |
+
}
|
tokenizer/vocab.txt
ADDED
|
@@ -0,0 +1,33 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
<cls>
|
| 2 |
+
<pad>
|
| 3 |
+
<eos>
|
| 4 |
+
<unk>
|
| 5 |
+
L
|
| 6 |
+
A
|
| 7 |
+
G
|
| 8 |
+
V
|
| 9 |
+
S
|
| 10 |
+
E
|
| 11 |
+
R
|
| 12 |
+
T
|
| 13 |
+
I
|
| 14 |
+
D
|
| 15 |
+
P
|
| 16 |
+
K
|
| 17 |
+
Q
|
| 18 |
+
N
|
| 19 |
+
F
|
| 20 |
+
Y
|
| 21 |
+
M
|
| 22 |
+
H
|
| 23 |
+
W
|
| 24 |
+
C
|
| 25 |
+
X
|
| 26 |
+
B
|
| 27 |
+
U
|
| 28 |
+
Z
|
| 29 |
+
O
|
| 30 |
+
.
|
| 31 |
+
-
|
| 32 |
+
<null_1>
|
| 33 |
+
<mask>
|