Spaces:

ronig
/

protein_binding_search

Running

App Files Files Community

roni committed on Jun 27, 2023

Commit

e873d33

1 Parent(s): 7217cfd

App switched to use Milvus instead of Annoy

Browse files

Files changed (8) hide show

Makefile +1 -1
app.py +22 -13
get_index.py +0 -36
index_list.py +11 -0
pylintrc +0 -20
requirements-dev.txt +1 -1
requirements.txt +2 -1
search_engine.py +113 -0

Makefile CHANGED Viewed

@@ -12,4 +12,4 @@ check-formatting:
 	venv/bin/black --check .
 lint-python:
-	venv/bin/pylint --rcfile=pylintrc .

 	venv/bin/black --check .
 lint-python:
+	venv/bin/ruff .

app.py CHANGED Viewed

@@ -1,31 +1,40 @@
 import collections
 from typing import Dict, List
 import gradio as gr
-from get_index import get_engines
 from protein_viz import get_pdb_title, render_html
-index_repo = "ronig/protein_index"
-model_repo = "ronig/protein_search_engine"
-engines = get_engines(index_repo, model_repo)
-available_indexes = list(engines.keys())
-app_description = """
-# Protein Binding Search Engine
-This application enables a quick protein-peptide binding search based on sequences.
-You can use it to search the full [PDB](https://www.rcsb.org/) database or in a specific organism genome.
-"""
 max_results = 1000
 choice_sep = " | "
 max_seq_length = 50
 def search_and_display(seq, max_res, index_selection):
-    n_search_res = 10000
     _validate_sequence_length(seq)
     max_res = int(limit_n_results(max_res))
-    engine = engines[index_selection]
-    search_res = engine.search_by_sequence(seq, n=n_search_res)
     agg_search_results = aggregate_search_results(search_res, max_res)
     formatted_search_results = format_search_results(agg_search_results)
     results_options = update_dropdown_menu(agg_search_results)

 import collections
+import os
 from typing import Dict, List
 import gradio as gr
+from index_list import read_index_list
 from protein_viz import get_pdb_title, render_html
+from search_engine import MilvusParams, ProteinSearchEngine
+model_repo = "ronig/protein_biencoder"
+available_indexes = read_index_list()
+engine = ProteinSearchEngine(
+    milvus_params=MilvusParams(
+        uri="https://in03-ddab8e9a5a09fcc.api.gcp-us-west1.zillizcloud.com",
+        token=os.environ.get("MILVUS_TOKEN"),
+        db_name="Protein",
+        collection_name="Peptriever",
+    ),
+    model_repo=model_repo,
+)
 max_results = 1000
 choice_sep = " | "
 max_seq_length = 50
 def search_and_display(seq, max_res, index_selection):
+    n_search_res = 1024
     _validate_sequence_length(seq)
     max_res = int(limit_n_results(max_res))
+    if index_selection == "All Species":
+        index_selection = None
+    search_res = engine.search_by_sequence(
+        seq, n=n_search_res, organism=index_selection
+    )
     agg_search_results = aggregate_search_results(search_res, max_res)
     formatted_search_results = format_search_results(agg_search_results)
     results_options = update_dropdown_menu(agg_search_results)

get_index.py DELETED Viewed

@@ -1,36 +0,0 @@
-import os.path
-import sys
-from glob import glob
-from pathlib import Path
-from huggingface_hub import snapshot_download
-from credentials import get_token
-def get_engines(index_repo: str, model_repo: str):
-    index_path = Path(
-        snapshot_download(index_repo, use_auth_token=get_token(), repo_type="dataset")
-    )
-    local_arch_path = Path(
-        snapshot_download(model_repo, use_auth_token=get_token(), repo_type="model")
-    )
-    sys.path.append(str(local_arch_path))
-    from protein_index import (  # pylint: disable=import-error,import-outside-toplevel
-        ProteinSearchEngine,
-        ProteinIndexError,
-    )
-    subindex_paths = glob(str(index_path / "*/"))
-    engines = {}
-    for subindex_path in subindex_paths:
-        subindex_name = os.path.basename(subindex_path)
-        try:
-            engine = ProteinSearchEngine(data_path=Path(subindex_path))
-            if len(engine) > 1000:
-                engines[subindex_name] = engine
-        except ProteinIndexError:
-            ...
-    return engines

index_list.py ADDED Viewed

	@@ -0,0 +1,11 @@

+import os.path
+def read_index_list():
+    here = os.path.dirname(__file__)
+    fname = os.path.join(here, "available_organisms.txt")
+    indexes = ["All Species"]
+    with open(fname) as f:
+        for index in f:
+            indexes.append(index.strip())
+    return indexes

pylintrc DELETED Viewed

@@ -1,20 +0,0 @@
-[MESSAGES CONTROL]
-disable=missing-docstring,invalid-name,logging-fstring-interpolation
-[DESIGN]
-min-public-methods=1
-[FORMAT]
-max-line-length=88
-[SIMILARITIES]
-min-similarity-lines=10
-[TYPECHECK]
-[MASTER]
-init-hook=import sys; sys.path.append(".")
-extension-pkg-whitelist=pydantic,cassandra
-generated-members=torch.*,cv2.*,np.random.*
-ignore-patterns=setup,py,tasks.py
-max-args=6

requirements-dev.txt CHANGED Viewed

@@ -1,5 +1,5 @@
 pytest
-pylint
 black
 mypy
 huggingface_hub

 pytest
+ruff
 black
 mypy
 huggingface_hub

requirements.txt CHANGED Viewed

@@ -1,4 +1,5 @@
 torch
 transformers
 annoy
-mygene

 torch
 transformers
 annoy
+mygene
+pymilvus

search_engine.py ADDED Viewed

	@@ -0,0 +1,113 @@

+import dataclasses
+import math
+from typing import List, Optional
+import torch
+from pymilvus import MilvusClient, connections
+from transformers import AutoModel, AutoTokenizer
+from credentials import get_token
+@dataclasses.dataclass
+class MilvusParams:
+    uri: str
+    token: str
+    db_name: str
+    collection_name: str
+class ProteinSearchEngine:
+    n_dims = 128
+    dist_metric = "euclidean"
+    max_lengths = (30, 300)
+    def __init__(self, milvus_params: MilvusParams, model_repo: str):
+        self.model_repo = model_repo
+        self.milvus_params = milvus_params
+        connections.connect(
+            "default",
+            uri=milvus_params.uri,
+            token=milvus_params.token,
+            db_name=milvus_params.db_name,
+        )
+        self.client = MilvusClient(uri=milvus_params.uri, token=milvus_params.token)
+        self.tokenizer = AutoTokenizer.from_pretrained(
+            self.model_repo, use_auth_token=get_token()
+        )
+        self.model = AutoModel.from_pretrained(
+            self.model_repo, use_auth_token=get_token(), trust_remote_code=True
+        )
+        self.model.eval()
+    def search_by_sequence(self, sequence: str, n: int, organism: Optional[str] = None):
+        max_length = self.max_lengths[0]
+        vec = self._embed_sequence(max_length, sequence)
+        response = self.search(vec, n_results=n, is_peptide=False, organism=organism)
+        search_results = self._format_search_results(response)
+        return search_results
+    def _embed_sequence(self, max_length, sequence):
+        encoded = self.tokenizer.encode_plus(
+            sequence,
+            add_special_tokens=True,
+            truncation=True,
+            max_length=max_length,
+            padding="max_length",
+            return_tensors="pt",
+        )
+        with torch.no_grad():
+            vec = (
+                self.model.forward1(encoded.to(self.model.device))
+                .squeeze()
+                .cpu()
+                .numpy()
+            )
+        return vec
+    def _format_search_results(self, response):
+        search_results = []
+        max_dist = math.sqrt(2 * self.n_dims)
+        for res in response:
+            entry = res["entity"]
+            dist = math.sqrt(res["distance"])
+            entry["dist"] = dist
+            entry["score"] = (max_dist - dist) / max_dist
+            search_results.append(entry)
+        return search_results
+    def search(
+        self,
+        vec: List[float],
+        n_results: int,
+        is_peptide: bool,
+        organism: Optional[str] = None,
+    ):
+        is_peptide = bool(is_peptide)
+        filter_str = f"is_peptide == {is_peptide}"
+        if organism is not None:
+            filter_str += f" and organism == '{organism}'"
+        results = self.client.search(
+            collection_name=self.milvus_params.collection_name,
+            data=[vec],
+            limit=n_results,
+            output_fields=[
+                "genes",
+                "uniprot_id",
+                "pdb_name",
+                "chain_id",
+                "is_peptide",
+                "organism",
+            ],
+            filter=filter_str,
+        )
+        return results[0]
+    def get_organisms(self):
+        res = self.client.query(
+            collection_name=self.milvus_params.collection_name,
+            output_fields=["organism"],
+            filter="entry_id > 0",
+        )
+        return res