Spaces:

isyslab
/

LocPred-Prok

Running

App Files Community

wangleiofficial committed on Sep 2, 2025

Commit

7b163dc

verified ·

1 Parent(s): ab6d96f

Update app.py

Browse files

Files changed (1) hide show

app.py +61 -49

app.py CHANGED Viewed

@@ -55,7 +55,6 @@ PLM_MODEL_NAME = "facebook/esm2_t30_150M_UR50D"
 CLASSIFIER_PATH = "best_model_esm2_t30_150M_UR50D.pth"
 LABEL_MAP_PATH = "label_map.json"
-# Load the label map file
 try:
     with open(LABEL_MAP_PATH, 'r') as f:
         label_to_idx = json.load(f)
@@ -64,23 +63,18 @@ except FileNotFoundError:
     raise FileNotFoundError(f"Error: Could not find '{LABEL_MAP_PATH}'. Please make sure this file is uploaded to the Space.")
 NUM_CLASSES = len(idx_to_label)
-D_MODEL = 640 # Dimension for esm2_t30_150M_UR50D
-# Load Protein Language Model (PLM) and tokenizer
 print("Loading Protein Language Model...")
 tokenizer = AutoTokenizer.from_pretrained(PLM_MODEL_NAME)
 plm_model = AutoModel.from_pretrained(PLM_MODEL_NAME).to(DEVICE)
 plm_model.eval()
 print("PLM loaded successfully.")
-# Load your trained downstream classifier
 print("Loading downstream classifier...")
 classifier = ProtDualBranchEnhancedClassifier(
-    d_model=D_MODEL,
-    projection_dim=32,
-    num_classes=NUM_CLASSES,
-    dropout=0.3,
-    kernel_size=3
 ).to(DEVICE)
 if not os.path.exists(CLASSIFIER_PATH):
@@ -92,24 +86,15 @@ print("Classifier loaded. Application is ready!")
 # --- 3. Prediction Function ---
 def predict(sequence_input):
-    """
-    Receives a protein sequence and returns a dictionary of class probabilities.
-    """
     if not sequence_input or sequence_input.isspace():
         return {"Error": "Please enter a protein sequence."}
-    # Clean the input, support FASTA format
-    if sequence_input.startswith('>'):
-        sequence = "".join(sequence_input.split('\n')[1:])
-    else:
-        sequence = sequence_input
     sequence = re.sub(r'[^A-Z]', '', sequence.upper())
     if not sequence:
         return {"Error": "Sequence is empty after cleaning. Please enter a valid amino acid sequence."}
-    # Feature extraction with PLM
     with torch.no_grad():
         inputs = tokenizer(sequence, return_tensors="pt", truncation=True, max_length=1024).to(DEVICE)
         outputs = plm_model(**inputs)
@@ -117,41 +102,68 @@ def predict(sequence_input):
         hidden_states = outputs.last_hidden_state
         cls_embedding = hidden_states[:, 0, :]
         token_embeddings = hidden_states[:, 1:-1, :]
-        token_mask = inputs['attention_mask'][:, 2:]
-    # Prediction with the downstream classifier
     with torch.no_grad():
         logits = classifier(cls_embedding, token_embeddings, token_mask)
         probabilities = F.softmax(logits, dim=1)[0]
-    # Format the output
     confidences = {idx_to_label[i]: float(prob) for i, prob in enumerate(probabilities)}
     return confidences
-# --- 4. Create Gradio Interface ---
-title = "Predicting the subcellular location of prokaryotic proteins with LocPred-Prok"
-description = """
-This is a prediction tool based on the **ESM-2 (150M)** Protein Language Model and a custom **`dual_branch_enhanced`** classifier.
-Simply paste a protein's amino acid sequence (FASTA format or raw sequence are both supported) into the text box below, and the model will predict its localization within the cell.
-"""
-examples = [
-    [">sp|P27361|PBP2_ECOLI Penicillin-binding protein 2 OS=Escherichia coli (strain K12) OX=83333 GN=mrdA PE=1 SV=2\nMKFKLTAGCLAVAGVLLASSFGADAEIVVNAIYDQVARTEDGVYTQGQLTGRRIELLNKLGIEPEDSLASTVIHEFVARVGDDHGIETIIDEFYRQHPSASL"],
-    ["MSKLVKTLTISEISKAQNNGGKPAWCWYTLAMCGAGYDSGTCDYMYSHCFGIKHHSSGSSSYHC"],
-]
-gr.Interface(
-    fn=predict,
-    inputs=gr.Textbox(
-        lines=10,
-        label="Protein Sequence",
-        placeholder="Paste your amino acid sequence here..."
-    ),
-    outputs=gr.Label(num_top_classes=NUM_CLASSES, label="Prediction Results"),
-    title=title,
-    description=description,
-    examples=examples,
-    allow_flagging="never",
-    theme=gr.themes.Soft()
-).launch()

 CLASSIFIER_PATH = "best_model_esm2_t30_150M_UR50D.pth"
 LABEL_MAP_PATH = "label_map.json"
 try:
     with open(LABEL_MAP_PATH, 'r') as f:
         label_to_idx = json.load(f)
     raise FileNotFoundError(f"Error: Could not find '{LABEL_MAP_PATH}'. Please make sure this file is uploaded to the Space.")
 NUM_CLASSES = len(idx_to_label)
+D_MODEL = 640
 print("Loading Protein Language Model...")
 tokenizer = AutoTokenizer.from_pretrained(PLM_MODEL_NAME)
 plm_model = AutoModel.from_pretrained(PLM_MODEL_NAME).to(DEVICE)
 plm_model.eval()
 print("PLM loaded successfully.")
 print("Loading downstream classifier...")
 classifier = ProtDualBranchEnhancedClassifier(
+    d_model=D_MODEL, projection_dim=32, num_classes=NUM_CLASSES,
+    dropout=0.3, kernel_size=3
 ).to(DEVICE)
 if not os.path.exists(CLASSIFIER_PATH):
 # --- 3. Prediction Function ---
 def predict(sequence_input):
     if not sequence_input or sequence_input.isspace():
         return {"Error": "Please enter a protein sequence."}
+    sequence = "".join(sequence_input.split('\n')[1:]) if sequence_input.startswith('>') else sequence_input
     sequence = re.sub(r'[^A-Z]', '', sequence.upper())
     if not sequence:
         return {"Error": "Sequence is empty after cleaning. Please enter a valid amino acid sequence."}
     with torch.no_grad():
         inputs = tokenizer(sequence, return_tensors="pt", truncation=True, max_length=1024).to(DEVICE)
         outputs = plm_model(**inputs)
         hidden_states = outputs.last_hidden_state
         cls_embedding = hidden_states[:, 0, :]
         token_embeddings = hidden_states[:, 1:-1, :]
+        token_mask = inputs['attention_mask'][:, 1:-1]
     with torch.no_grad():
         logits = classifier(cls_embedding, token_embeddings, token_mask)
         probabilities = F.softmax(logits, dim=1)[0]
     confidences = {idx_to_label[i]: float(prob) for i, prob in enumerate(probabilities)}
     return confidences
+# --- 4. Create Beautified Gradio Interface using Blocks ---
+with gr.Blocks(theme=gr.themes.Soft(), css=".gradio-container {max-width: 800px; margin: auto;}") as app:
+    gr.Markdown(
+        """
+        # Protein Subcellular Localization Prediction
+        An online prediction tool based on the **ESM-2 (150M)** Protein Language Model and a custom **`dual_branch_enhanced`** classifier.
+        Just paste the amino acid sequence of a protein (FASTA format or raw sequence are supported), and the model will predict its location within the cell.
+        """
+    )
+    with gr.Row():
+        with gr.Column(scale=1):
+            sequence_input = gr.Textbox(
+                lines=10,
+                label="Protein Sequence",
+                placeholder="Paste your amino acid sequence here..."
+            )
+            with gr.Row():
+                clear_btn = gr.ClearButton()
+                submit_btn = gr.Button("🚀 Predict", variant="primary")
+            gr.Examples(
+                examples=[
+                    [">sp|P27361|PBP2_ECOLI Penicillin-binding protein 2 OS=Escherichia coli (strain K12) OX=83333 GN=mrdA PE=1 SV=2\nMKFKLTAGCLAVAGVLLASSFGADAEIVVNAIYDQVARTEDGVYTQGQLTGRRIELLNKLGIEPEDSLASTVIHEFVARVGDDHGIETIIDEFYRQHPSASL"],
+                    ["MSKLVKTLTISEISKAQNNGGKPAWCWYTLAMCGAGYDSGTCDYMYSHCFGIKHHSSGSSSYHC"],
+                ],
+                inputs=sequence_input,
+                label="Examples"
+            )
+        with gr.Column(scale=1):
+            output_label = gr.Label(num_top_classes=NUM_CLASSES, label="Prediction Results")
+            with gr.Accordion("Model Information", open=False):
+                gr.Markdown(
+                    """
+                    * **Protein Language Model (PLM)**: `facebook/esm2_t30_150M_UR50D`
+                    * **Downstream Classifier**: `ProtDualBranchEnhancedClassifier`
+                    * **GitHub Repository**: github.com/isyslab-hust
+                    """
+                )
+    gr.Markdown(
+        """
+        ---
+        *Built by isyslab*
+        """
+    )
+    submit_btn.click(fn=predict, inputs=sequence_input, outputs=output_label, api_name="predict")
+    clear_btn.click(lambda: [None, None], outputs=[sequence_input, output_label])
+app.launch()