livctr committed on
Commit
0eab751
·
1 Parent(s): 7da737a

arrow -> csv ita data

Browse files
.gitattributes CHANGED
@@ -1,3 +1,8 @@
1
  data/frontend_data/ filter=lfs diff=lfs merge=lfs -text
2
  data/frontend_data/* filter=lfs diff=lfs merge=lfs -text
3
  data/frontend_data/**/* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
1
  data/frontend_data/ filter=lfs diff=lfs merge=lfs -text
2
  data/frontend_data/* filter=lfs diff=lfs merge=lfs -text
3
  data/frontend_data/**/* filter=lfs diff=lfs merge=lfs -text
4
+ data/frontend_data/us_professor.json filter=lfs diff=lfs merge=lfs -text
5
+ data/frontend_data/all-mpnet-base-v2-embds/weights.pt filter=lfs diff=lfs merge=lfs -text
6
+ data/frontend_data/all-mpnet-base-v2-embds/id_title_author/data-00000-of-00001.arrow filter=lfs diff=lfs merge=lfs -text
7
+ data/frontend_data/all-mpnet-base-v2-embds/id_title_author/dataset_info.json filter=lfs diff=lfs merge=lfs -text
8
+ data/frontend_data/all-mpnet-base-v2-embds/id_title_author/state.json filter=lfs diff=lfs merge=lfs -text
core/recommender.py CHANGED
@@ -1,7 +1,7 @@
1
  from collections import Counter, defaultdict
2
  import json
3
 
4
- from datasets import Dataset
5
  import torch
6
  import torch.nn.functional as F
7
  from transformers import AutoTokenizer, AutoModel
@@ -72,7 +72,7 @@ class Recommender:
72
  frontend_us_professor_path: str = DataPaths.FRONTEND_PROF_PATH,
73
  ):
74
  self.embedding_processor = embedding_processor
75
- self.ita = Dataset.load_from_disk(ita_path)
76
  self.embds = torch.load(weights_path, weights_only=True)
77
  # dictionary with professor names as keys and their metadata as values
78
  with open(frontend_us_professor_path, 'r') as f:
@@ -87,8 +87,9 @@ class Recommender:
87
 
88
  def get_recommended_data(self, top_indices: torch.Tensor):
89
  """Returns a list of dictionaries with professors corresponding to their information."""
90
- selected = self.ita.select(top_indices)
91
- professors = selected["authors"]
 
92
  professors = [prof for profs in professors for prof in profs]
93
 
94
  # rank professors first by number of times appeared in the list
@@ -98,8 +99,12 @@ class Recommender:
98
 
99
  # professor to IDs
100
  professor2ids = defaultdict(list)
101
- for pid_, pt, pauthors in zip(selected['id'], selected['title'], selected['authors']):
102
- for prof in pauthors:
 
 
 
 
103
  professor2ids[prof].append((pid_, pt))
104
 
105
  # Build professor metadata
 
1
  from collections import Counter, defaultdict
2
  import json
3
 
4
+ import pandas as pd
5
  import torch
6
  import torch.nn.functional as F
7
  from transformers import AutoTokenizer, AutoModel
 
72
  frontend_us_professor_path: str = DataPaths.FRONTEND_PROF_PATH,
73
  ):
74
  self.embedding_processor = embedding_processor
75
+ self.ita = pd.read_csv(ita_path)
76
  self.embds = torch.load(weights_path, weights_only=True)
77
  # dictionary with professor names as keys and their metadata as values
78
  with open(frontend_us_professor_path, 'r') as f:
 
87
 
88
  def get_recommended_data(self, top_indices: torch.Tensor):
89
  """Returns a list of dictionaries with professors corresponding to their information."""
90
+ selected = self.ita.iloc[top_indices]
91
+
92
+ professors = [x.split("|-|") for x in selected["authors"]]
93
  professors = [prof for profs in professors for prof in profs]
94
 
95
  # rank professors first by number of times appeared in the list
 
99
 
100
  # professor to IDs
101
  professor2ids = defaultdict(list)
102
+ for pid_, pt, pauthors in zip(
103
+ selected['id'].tolist(),
104
+ selected['title'].tolist(),
105
+ selected['authors'].tolist()
106
+ ):
107
+ for prof in pauthors.split("|-|"):
108
  professor2ids[prof].append((pid_, pt))
109
 
110
  # Build professor metadata
data/frontend_data/all-mpnet-base-v2-embds/id_title_author/state.json DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:7961828744086f5b98697879c399db414cbf8e921ae858c62d61ab8f79ecba59
3
- size 250
 
 
 
 
data/frontend_data/all-mpnet-base-v2-embds/{id_title_author/dataset_info.json → ita.csv} RENAMED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a13329a3e3ffb5721d11f0e2af55847ed65dd9cbd5fb0f2d85984312c0b8217a
3
- size 810
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:19672f2088eef5dc28f75010f266bdbccfa5c85b02f1ec0728fad5bae1d8b44d
3
+ size 1424458
data_pipeline/config.py CHANGED
@@ -28,7 +28,7 @@ class DataPaths:
28
  FRONTEND_DIR = os.path.join(BASE_DIR, 'frontend_data')
29
  FRONTEND_PROF_PATH = os.path.join(FRONTEND_DIR, 'us_professor.json')
30
  FRONTEND_EMBD_PATH = os.path.join(FRONTEND_DIR, EMBD_MODEL) # contains id, title, author, weights
31
- FRONTEND_ITA_PATH = os.path.join(FRONTEND_EMBD_PATH, 'id_title_author')
32
  FRONTEND_WEIGHTS_PATH = os.path.join(FRONTEND_EMBD_PATH, 'weights.pt')
33
 
34
  # create BASE_DIR LOG_DIR FRONTEND_DIR PROF_DIR CONFERENCE_DIR PAPER_DIR
 
28
  FRONTEND_DIR = os.path.join(BASE_DIR, 'frontend_data')
29
  FRONTEND_PROF_PATH = os.path.join(FRONTEND_DIR, 'us_professor.json')
30
  FRONTEND_EMBD_PATH = os.path.join(FRONTEND_DIR, EMBD_MODEL) # contains id, title, author, weights
31
+ FRONTEND_ITA_PATH = os.path.join(FRONTEND_EMBD_PATH, 'ita.csv')
32
  FRONTEND_WEIGHTS_PATH = os.path.join(FRONTEND_EMBD_PATH, 'weights.pt')
33
 
34
  # create BASE_DIR LOG_DIR FRONTEND_DIR PROF_DIR CONFERENCE_DIR PAPER_DIR
data_pipeline/paper_embeddings_extractor.py CHANGED
@@ -214,37 +214,45 @@ def main():
214
  """Downloads arxiv data and extract embeddings for papers."""
215
 
216
  ### Download and filter for ML papers written by U.S. professors ###
217
- print("Downloading data...")
218
- download_arxiv_data()
219
- with open(DataPaths.US_PROF_PATH, 'r') as f:
220
- authors_of_interest = json.load(f)
221
- authors_of_interest = [author['name'] for author in authors_of_interest]
222
- print("Filtering data for ML papers...")
223
- filter_arxiv_for_ml(authors_of_interest=authors_of_interest)
224
-
225
- ### Create a dataset containing paper info, e.g., title, abstract, authors, etc. ###
226
- print("Saving paper data to disk at " + DataPaths.PAPER_DATA_PATH)
227
- p2p = get_professors_and_relevant_papers(authors_of_interest)
228
- ds = Dataset.from_generator(partial(gen, p2p))
229
- ds.save_to_disk(DataPaths.PAPER_DATA_PATH)
230
-
231
- ### Extract paper embeddings ###
232
- print("Extracting embeddings (use GPU if possible)...")
233
- # Initialize the embedding processor with model names
234
- embedding_processor = EmbeddingProcessor(
235
- model_name='sentence-transformers/all-mpnet-base-v2',
236
- custom_model_name='salsabiilashifa11/sbert-paper'
237
- )
238
- # Process dataset and save with embeddings
239
- embedding_processor.process_dataset(DataPaths.PAPER_DATA_PATH, DataPaths.EMBD_PATH, batch_size=128)
240
 
241
  ### Create front-end data ###
242
 
243
  # Filter ds for paper title, id, authors, and embedding
244
  embds = Dataset.load_from_disk(DataPaths.EMBD_PATH)
245
 
 
 
 
 
 
 
 
 
246
  # save id and title to disk
247
- embds.select_columns(['id', 'title', 'authors']).save_to_disk(DataPaths.FRONTEND_ITA_PATH)
248
  # save embeddings as torch tensor
249
  embds_weights = torch.Tensor(embds['embeddings'])
250
  torch.save(embds_weights, DataPaths.FRONTEND_WEIGHTS_PATH)
 
214
  """Downloads arxiv data and extract embeddings for papers."""
215
 
216
  ### Download and filter for ML papers written by U.S. professors ###
217
+ # print("Downloading data...")
218
+ # download_arxiv_data()
219
+ # with open(DataPaths.US_PROF_PATH, 'r') as f:
220
+ # authors_of_interest = json.load(f)
221
+ # authors_of_interest = [author['name'] for author in authors_of_interest]
222
+ # print("Filtering data for ML papers...")
223
+ # filter_arxiv_for_ml(authors_of_interest=authors_of_interest)
224
+
225
+ # ### Create a dataset containing paper info, e.g., title, abstract, authors, etc. ###
226
+ # print("Saving paper data to disk at " + DataPaths.PAPER_DATA_PATH)
227
+ # p2p = get_professors_and_relevant_papers(authors_of_interest)
228
+ # ds = Dataset.from_generator(partial(gen, p2p))
229
+ # ds.save_to_disk(DataPaths.PAPER_DATA_PATH)
230
+
231
+ # ### Extract paper embeddings ###
232
+ # print("Extracting embeddings (use GPU if possible)...")
233
+ # # Initialize the embedding processor with model names
234
+ # embedding_processor = EmbeddingProcessor(
235
+ # model_name='sentence-transformers/all-mpnet-base-v2',
236
+ # custom_model_name='salsabiilashifa11/sbert-paper'
237
+ # )
238
+ # # Process dataset and save with embeddings
239
+ # embedding_processor.process_dataset(DataPaths.PAPER_DATA_PATH, DataPaths.EMBD_PATH, batch_size=128)
240
 
241
  ### Create front-end data ###
242
 
243
  # Filter ds for paper title, id, authors, and embedding
244
  embds = Dataset.load_from_disk(DataPaths.EMBD_PATH)
245
 
246
+ def join_authors(x):
247
+ x['authors'] = "|-|".join(x['authors'])
248
+ return x
249
+
250
+ import pdb ; pdb.set_trace()
251
+
252
+ embds = embds.map(join_authors)
253
+
254
  # save id and title to disk
255
+ embds.select_columns(['id', 'title', 'authors']).to_csv(DataPaths.FRONTEND_ITA_PATH)
256
  # save embeddings as torch tensor
257
  embds_weights = torch.Tensor(embds['embeddings'])
258
  torch.save(embds_weights, DataPaths.FRONTEND_WEIGHTS_PATH)
requirements.txt CHANGED
@@ -1,4 +1,4 @@
1
- datasets
2
  streamlit
3
  torch
4
  transformers
 
1
+ pandas
2
  streamlit
3
  torch
4
  transformers