Spaces:
Build error
Build error
gabriel lopez committed on
Commit ·
e84989e
1
Parent(s): 44b6718
cosmetics
Browse files
- app.py +1 -0
- arxiv_tool/core.py +4 -1
app.py
CHANGED
|
@@ -13,6 +13,7 @@ EXAMPLES = [
|
|
| 13 |
ARTICLE = r"""<center>
|
| 14 |
This application uses Sentence-BERT embeddings.
|
| 15 |
Sentence Embedding is achieved here via Siamese BERT-Networks from <a href=https://arxiv.org/abs/1908.10084>this paper</a> <br>
|
|
|
|
| 16 |
Done by dr. Gabriel Lopez<br>
|
| 17 |
For more please visit: <a href='https://sites.google.com/view/dr-gabriel-lopez/home'>My Page</a>
|
| 18 |
</center>"""
|
|
|
|
| 13 |
ARTICLE = r"""<center>
|
| 14 |
This application uses Sentence-BERT embeddings.
|
| 15 |
Sentence Embedding is achieved here via Siamese BERT-Networks from <a href=https://arxiv.org/abs/1908.10084>this paper</a> <br>
|
| 16 |
+
After embedding, encoded papers are projected into the unit sphere and a nearest neighbours search is done to extract best matching results.
|
| 17 |
Done by dr. Gabriel Lopez<br>
|
| 18 |
For more please visit: <a href='https://sites.google.com/view/dr-gabriel-lopez/home'>My Page</a>
|
| 19 |
</center>"""
|
arxiv_tool/core.py
CHANGED
|
@@ -14,6 +14,7 @@ class SentenceEncoder:
|
|
| 14 |
"""
|
| 15 |
|
| 16 |
def load_and_encode(self):
|
|
|
|
| 17 |
# load
|
| 18 |
df = self._load()
|
| 19 |
# encode
|
|
@@ -21,7 +22,7 @@ class SentenceEncoder:
|
|
| 21 |
return df, model, embeddings
|
| 22 |
|
| 23 |
def transform(self, df, querry, model, embeddings):
|
| 24 |
-
""" """
|
| 25 |
# create_index
|
| 26 |
emb_querry = self._econde_querry(querry, model)
|
| 27 |
# search
|
|
@@ -50,6 +51,7 @@ class SentenceEncoder:
|
|
| 50 |
return emb_querry
|
| 51 |
|
| 52 |
def _make_search(self, df, emb_querry, embeddings):
|
|
|
|
| 53 |
# initialize a new index, using a HNSW index on Cosine Similarity
|
| 54 |
index = nmslib.init(method="hnsw", space="cosinesimil")
|
| 55 |
index.addDataPointBatch(embeddings)
|
|
@@ -73,6 +75,7 @@ class SentenceEncoder:
|
|
| 73 |
return pd.DataFrame(data)
|
| 74 |
|
| 75 |
def _add_relevant_columns(self, df, result):
|
|
|
|
| 76 |
# get categories
|
| 77 |
df["categories_parsed"] = (
|
| 78 |
df.categories.str.split()
|
|
|
|
| 14 |
"""
|
| 15 |
|
| 16 |
def load_and_encode(self):
|
| 17 |
+
"""prepare data before running search querry"""
|
| 18 |
# load
|
| 19 |
df = self._load()
|
| 20 |
# encode
|
|
|
|
| 22 |
return df, model, embeddings
|
| 23 |
|
| 24 |
def transform(self, df, querry, model, embeddings):
|
| 25 |
+
"""main querry pipeline"""
|
| 26 |
# create_index
|
| 27 |
emb_querry = self._econde_querry(querry, model)
|
| 28 |
# search
|
|
|
|
| 51 |
return emb_querry
|
| 52 |
|
| 53 |
def _make_search(self, df, emb_querry, embeddings):
|
| 54 |
+
"""search for nearest K neighbours in the embedding space"""
|
| 55 |
# initialize a new index, using a HNSW index on Cosine Similarity
|
| 56 |
index = nmslib.init(method="hnsw", space="cosinesimil")
|
| 57 |
index.addDataPointBatch(embeddings)
|
|
|
|
| 75 |
return pd.DataFrame(data)
|
| 76 |
|
| 77 |
def _add_relevant_columns(self, df, result):
|
| 78 |
+
"""post processing"""
|
| 79 |
# get categories
|
| 80 |
df["categories_parsed"] = (
|
| 81 |
df.categories.str.split()
|