gabriel lopez committed on
Commit
e84989e
·
1 Parent(s): 44b6718

cosmetics

Browse files
Files changed (2) hide show
  1. app.py +1 -0
  2. arxiv_tool/core.py +4 -1
app.py CHANGED
@@ -13,6 +13,7 @@ EXAMPLES = [
13
  ARTICLE = r"""<center>
14
  This application uses Sentence-BERT embeddings.
15
  Sentence Embedding is achieved here via Siamese BERT-Networks from <a href=https://arxiv.org/abs/1908.10084>this paper</a> <br>
 
16
  Done by dr. Gabriel Lopez<br>
17
  For more please visit: <a href='https://sites.google.com/view/dr-gabriel-lopez/home'>My Page</a>
18
  </center>"""
 
13
  ARTICLE = r"""<center>
14
  This application uses Sentence-BERT embeddings.
15
  Sentence Embedding is achieved here via Siamese BERT-Networks from <a href=https://arxiv.org/abs/1908.10084>this paper</a> <br>
16
+ After embedding, the encoded papers are projected onto the unit sphere and a nearest-neighbours search is performed to extract the best-matching results. <br>
17
  Done by dr. Gabriel Lopez<br>
18
  For more please visit: <a href='https://sites.google.com/view/dr-gabriel-lopez/home'>My Page</a>
19
  </center>"""
arxiv_tool/core.py CHANGED
@@ -14,6 +14,7 @@ class SentenceEncoder:
14
  """
15
 
16
  def load_and_encode(self):
 
17
  # load
18
  df = self._load()
19
  # encode
@@ -21,7 +22,7 @@ class SentenceEncoder:
21
  return df, model, embeddings
22
 
23
  def transform(self, df, querry, model, embeddings):
24
- """ """
25
  # create_index
26
  emb_querry = self._econde_querry(querry, model)
27
  # search
@@ -50,6 +51,7 @@ class SentenceEncoder:
50
  return emb_querry
51
 
52
  def _make_search(self, df, emb_querry, embeddings):
 
53
  # initialize a new index, using a HNSW index on Cosine Similarity
54
  index = nmslib.init(method="hnsw", space="cosinesimil")
55
  index.addDataPointBatch(embeddings)
@@ -73,6 +75,7 @@ class SentenceEncoder:
73
  return pd.DataFrame(data)
74
 
75
  def _add_relevant_columns(self, df, result):
 
76
  # get categories
77
  df["categories_parsed"] = (
78
  df.categories.str.split()
 
14
  """
15
 
16
  def load_and_encode(self):
17
+ """Prepare data before running a search query."""
18
  # load
19
  df = self._load()
20
  # encode
 
22
  return df, model, embeddings
23
 
24
  def transform(self, df, querry, model, embeddings):
25
+ """Main query pipeline."""
26
  # create_index
27
  emb_querry = self._econde_querry(querry, model)
28
  # search
 
51
  return emb_querry
52
 
53
  def _make_search(self, df, emb_querry, embeddings):
54
+ """Search for the K nearest neighbours in the embedding space."""
55
  # initialize a new index, using a HNSW index on Cosine Similarity
56
  index = nmslib.init(method="hnsw", space="cosinesimil")
57
  index.addDataPointBatch(embeddings)
 
75
  return pd.DataFrame(data)
76
 
77
  def _add_relevant_columns(self, df, result):
78
+ """post processing"""
79
  # get categories
80
  df["categories_parsed"] = (
81
  df.categories.str.split()