Spaces:

DrGabrielLopez
/

arXiv-tool

Build error

gabriel lopez committed on Nov 13, 2022

Commit

e84989e

1 Parent(s): 44b6718

cosmetics

Files changed (2) hide show

app.py CHANGED Viewed

@@ -13,6 +13,7 @@ EXAMPLES = [
 ARTICLE = r"""<center>
               This application uses Sentence-BERT embeddings.
               Sentence Embedding is achieved here via Siamese BERT-Networks from  <a href=https://arxiv.org/abs/1908.10084>this paper</a> <br>
               Done by dr. Gabriel Lopez<br>
               For more please visit: <a href='https://sites.google.com/view/dr-gabriel-lopez/home'>My Page</a>
               </center>"""

 ARTICLE = r"""<center>
               This application uses Sentence-BERT embeddings.
               Sentence Embedding is achieved here via Siamese BERT-Networks from  <a href=https://arxiv.org/abs/1908.10084>this paper</a> <br>
+              After embedding, encoded papers are projected into the unit sphere and a nearest neighbours search is done to extract best matching results.
               Done by dr. Gabriel Lopez<br>
               For more please visit: <a href='https://sites.google.com/view/dr-gabriel-lopez/home'>My Page</a>
               </center>"""

arxiv_tool/core.py CHANGED Viewed

@@ -14,6 +14,7 @@ class SentenceEncoder:
     """
     def load_and_encode(self):
         # load
         df = self._load()
         # encode
@@ -21,7 +22,7 @@ class SentenceEncoder:
         return df, model, embeddings
     def transform(self, df, querry, model, embeddings):
-        """ """
         # create_index
         emb_querry = self._econde_querry(querry, model)
         # search
@@ -50,6 +51,7 @@ class SentenceEncoder:
         return emb_querry
     def _make_search(self, df, emb_querry, embeddings):
         # initialize a new index, using a HNSW index on Cosine Similarity
         index = nmslib.init(method="hnsw", space="cosinesimil")
         index.addDataPointBatch(embeddings)
@@ -73,6 +75,7 @@ class SentenceEncoder:
         return pd.DataFrame(data)
     def _add_relevant_columns(self, df, result):
         # get categories
         df["categories_parsed"] = (
             df.categories.str.split()

     """
     def load_and_encode(self):
+        """prepare data before running search querry"""
         # load
         df = self._load()
         # encode
         return df, model, embeddings
     def transform(self, df, querry, model, embeddings):
+        """main querry pipeline"""
         # create_index
         emb_querry = self._econde_querry(querry, model)
         # search
         return emb_querry
     def _make_search(self, df, emb_querry, embeddings):
+        """search for nearest K neighbours in the embedding space"""
         # initialize a new index, using a HNSW index on Cosine Similarity
         index = nmslib.init(method="hnsw", space="cosinesimil")
         index.addDataPointBatch(embeddings)
         return pd.DataFrame(data)
     def _add_relevant_columns(self, df, result):
+        """post processing"""
         # get categories
         df["categories_parsed"] = (
             df.categories.str.split()