Spaces:
Sleeping
Sleeping
arrow -> csv ita data
Browse files- .gitattributes +5 -0
- core/recommender.py +11 -6
- data/frontend_data/all-mpnet-base-v2-embds/id_title_author/state.json +0 -3
- data/frontend_data/all-mpnet-base-v2-embds/{id_title_author/dataset_info.json → ita.csv} +2 -2
- data_pipeline/config.py +1 -1
- data_pipeline/paper_embeddings_extractor.py +32 -24
- requirements.txt +1 -1
.gitattributes
CHANGED
|
@@ -1,3 +1,8 @@
|
|
| 1 |
data/frontend_data/ filter=lfs diff=lfs merge=lfs -text
|
| 2 |
data/frontend_data/* filter=lfs diff=lfs merge=lfs -text
|
| 3 |
data/frontend_data/**/* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
data/frontend_data/ filter=lfs diff=lfs merge=lfs -text
|
| 2 |
data/frontend_data/* filter=lfs diff=lfs merge=lfs -text
|
| 3 |
data/frontend_data/**/* filter=lfs diff=lfs merge=lfs -text
|
| 4 |
+
data/frontend_data/us_professor.json filter=lfs diff=lfs merge=lfs -text
|
| 5 |
+
data/frontend_data/all-mpnet-base-v2-embds/weights.pt filter=lfs diff=lfs merge=lfs -text
|
| 6 |
+
data/frontend_data/all-mpnet-base-v2-embds/id_title_author/data-00000-of-00001.arrow filter=lfs diff=lfs merge=lfs -text
|
| 7 |
+
data/frontend_data/all-mpnet-base-v2-embds/id_title_author/dataset_info.json filter=lfs diff=lfs merge=lfs -text
|
| 8 |
+
data/frontend_data/all-mpnet-base-v2-embds/id_title_author/state.json filter=lfs diff=lfs merge=lfs -text
|
core/recommender.py
CHANGED
|
@@ -1,7 +1,7 @@
|
|
| 1 |
from collections import Counter, defaultdict
|
| 2 |
import json
|
| 3 |
|
| 4 |
-
|
| 5 |
import torch
|
| 6 |
import torch.nn.functional as F
|
| 7 |
from transformers import AutoTokenizer, AutoModel
|
|
@@ -72,7 +72,7 @@ class Recommender:
|
|
| 72 |
frontend_us_professor_path: str = DataPaths.FRONTEND_PROF_PATH,
|
| 73 |
):
|
| 74 |
self.embedding_processor = embedding_processor
|
| 75 |
-
self.ita =
|
| 76 |
self.embds = torch.load(weights_path, weights_only=True)
|
| 77 |
# dictionary with professor names as keys and their metadata as values
|
| 78 |
with open(frontend_us_professor_path, 'r') as f:
|
|
@@ -87,8 +87,9 @@ class Recommender:
|
|
| 87 |
|
| 88 |
def get_recommended_data(self, top_indices: torch.Tensor):
|
| 89 |
"""Returns a list of dictionaries with professors corresponding to their information."""
|
| 90 |
-
selected = self.ita.
|
| 91 |
-
|
|
|
|
| 92 |
professors = [prof for profs in professors for prof in profs]
|
| 93 |
|
| 94 |
# rank professors first by the number of times they appear in the list
|
|
@@ -98,8 +99,12 @@ class Recommender:
|
|
| 98 |
|
| 99 |
# professor to IDs
|
| 100 |
professor2ids = defaultdict(list)
|
| 101 |
-
for pid_, pt, pauthors in zip(
|
| 102 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 103 |
professor2ids[prof].append((pid_, pt))
|
| 104 |
|
| 105 |
# Build professor metadata
|
|
|
|
| 1 |
from collections import Counter, defaultdict
|
| 2 |
import json
|
| 3 |
|
| 4 |
+
import pandas as pd
|
| 5 |
import torch
|
| 6 |
import torch.nn.functional as F
|
| 7 |
from transformers import AutoTokenizer, AutoModel
|
|
|
|
| 72 |
frontend_us_professor_path: str = DataPaths.FRONTEND_PROF_PATH,
|
| 73 |
):
|
| 74 |
self.embedding_processor = embedding_processor
|
| 75 |
+
self.ita = pd.read_csv(ita_path)
|
| 76 |
self.embds = torch.load(weights_path, weights_only=True)
|
| 77 |
# dictionary with professor names as keys and their metadata as values
|
| 78 |
with open(frontend_us_professor_path, 'r') as f:
|
|
|
|
| 87 |
|
| 88 |
def get_recommended_data(self, top_indices: torch.Tensor):
|
| 89 |
"""Returns a list of dictionaries with professors corresponding to their information."""
|
| 90 |
+
selected = self.ita.iloc[top_indices]
|
| 91 |
+
|
| 92 |
+
professors = [x.split("|-|") for x in selected["authors"]]
|
| 93 |
professors = [prof for profs in professors for prof in profs]
|
| 94 |
|
| 95 |
# rank professors first by the number of times they appear in the list
|
|
|
|
| 99 |
|
| 100 |
# professor to IDs
|
| 101 |
professor2ids = defaultdict(list)
|
| 102 |
+
for pid_, pt, pauthors in zip(
|
| 103 |
+
selected['id'].tolist(),
|
| 104 |
+
selected['title'].tolist(),
|
| 105 |
+
selected['authors'].tolist()
|
| 106 |
+
):
|
| 107 |
+
for prof in pauthors.split("|-|"):
|
| 108 |
professor2ids[prof].append((pid_, pt))
|
| 109 |
|
| 110 |
# Build professor metadata
|
data/frontend_data/all-mpnet-base-v2-embds/id_title_author/state.json
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:7961828744086f5b98697879c399db414cbf8e921ae858c62d61ab8f79ecba59
|
| 3 |
-
size 250
|
|
|
|
|
|
|
|
|
|
|
|
data/frontend_data/all-mpnet-base-v2-embds/{id_title_author/dataset_info.json → ita.csv}
RENAMED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
-
size
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:19672f2088eef5dc28f75010f266bdbccfa5c85b02f1ec0728fad5bae1d8b44d
|
| 3 |
+
size 1424458
|
data_pipeline/config.py
CHANGED
|
@@ -28,7 +28,7 @@ class DataPaths:
|
|
| 28 |
FRONTEND_DIR = os.path.join(BASE_DIR, 'frontend_data')
|
| 29 |
FRONTEND_PROF_PATH = os.path.join(FRONTEND_DIR, 'us_professor.json')
|
| 30 |
FRONTEND_EMBD_PATH = os.path.join(FRONTEND_DIR, EMBD_MODEL) # contains id, title, author, weights
|
| 31 |
-
FRONTEND_ITA_PATH = os.path.join(FRONTEND_EMBD_PATH, '
|
| 32 |
FRONTEND_WEIGHTS_PATH = os.path.join(FRONTEND_EMBD_PATH, 'weights.pt')
|
| 33 |
|
| 34 |
# create BASE_DIR LOG_DIR FRONTEND_DIR PROF_DIR CONFERENCE_DIR PAPER_DIR
|
|
|
|
| 28 |
FRONTEND_DIR = os.path.join(BASE_DIR, 'frontend_data')
|
| 29 |
FRONTEND_PROF_PATH = os.path.join(FRONTEND_DIR, 'us_professor.json')
|
| 30 |
FRONTEND_EMBD_PATH = os.path.join(FRONTEND_DIR, EMBD_MODEL) # contains id, title, author, weights
|
| 31 |
+
FRONTEND_ITA_PATH = os.path.join(FRONTEND_EMBD_PATH, 'ita.csv')
|
| 32 |
FRONTEND_WEIGHTS_PATH = os.path.join(FRONTEND_EMBD_PATH, 'weights.pt')
|
| 33 |
|
| 34 |
# create BASE_DIR LOG_DIR FRONTEND_DIR PROF_DIR CONFERENCE_DIR PAPER_DIR
|
data_pipeline/paper_embeddings_extractor.py
CHANGED
|
@@ -214,37 +214,45 @@ def main():
|
|
| 214 |
"""Downloads arxiv data and extract embeddings for papers."""
|
| 215 |
|
| 216 |
### Download and filter for ML papers written by U.S. professors ###
|
| 217 |
-
print("Downloading data...")
|
| 218 |
-
download_arxiv_data()
|
| 219 |
-
with open(DataPaths.US_PROF_PATH, 'r') as f:
|
| 220 |
-
|
| 221 |
-
authors_of_interest = [author['name'] for author in authors_of_interest]
|
| 222 |
-
print("Filtering data for ML papers...")
|
| 223 |
-
filter_arxiv_for_ml(authors_of_interest=authors_of_interest)
|
| 224 |
-
|
| 225 |
-
### Create a dataset containing paper info, e.g., title, abstract, authors, etc. ###
|
| 226 |
-
print("Saving paper data to disk at " + DataPaths.PAPER_DATA_PATH)
|
| 227 |
-
p2p = get_professors_and_relevant_papers(authors_of_interest)
|
| 228 |
-
ds = Dataset.from_generator(partial(gen, p2p))
|
| 229 |
-
ds.save_to_disk(DataPaths.PAPER_DATA_PATH)
|
| 230 |
-
|
| 231 |
-
### Extract paper embeddings ###
|
| 232 |
-
print("Extracting embeddings (use GPU if possible)...")
|
| 233 |
-
# Initialize the embedding processor with model names
|
| 234 |
-
embedding_processor = EmbeddingProcessor(
|
| 235 |
-
|
| 236 |
-
|
| 237 |
-
)
|
| 238 |
-
# Process dataset and save with embeddings
|
| 239 |
-
embedding_processor.process_dataset(DataPaths.PAPER_DATA_PATH, DataPaths.EMBD_PATH, batch_size=128)
|
| 240 |
|
| 241 |
### Create front-end data ###
|
| 242 |
|
| 243 |
# Filter ds for paper title, id, authors, and embedding
|
| 244 |
embds = Dataset.load_from_disk(DataPaths.EMBD_PATH)
|
| 245 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 246 |
# save id, title, and authors to disk
|
| 247 |
-
embds.select_columns(['id', 'title', 'authors']).
|
| 248 |
# save embeddings as torch tensor
|
| 249 |
embds_weights = torch.Tensor(embds['embeddings'])
|
| 250 |
torch.save(embds_weights, DataPaths.FRONTEND_WEIGHTS_PATH)
|
|
|
|
| 214 |
"""Downloads arxiv data and extract embeddings for papers."""
|
| 215 |
|
| 216 |
### Download and filter for ML papers written by U.S. professors ###
|
| 217 |
+
# print("Downloading data...")
|
| 218 |
+
# download_arxiv_data()
|
| 219 |
+
# with open(DataPaths.US_PROF_PATH, 'r') as f:
|
| 220 |
+
# authors_of_interest = json.load(f)
|
| 221 |
+
# authors_of_interest = [author['name'] for author in authors_of_interest]
|
| 222 |
+
# print("Filtering data for ML papers...")
|
| 223 |
+
# filter_arxiv_for_ml(authors_of_interest=authors_of_interest)
|
| 224 |
+
|
| 225 |
+
# ### Create a dataset containing paper info, e.g., title, abstract, authors, etc. ###
|
| 226 |
+
# print("Saving paper data to disk at " + DataPaths.PAPER_DATA_PATH)
|
| 227 |
+
# p2p = get_professors_and_relevant_papers(authors_of_interest)
|
| 228 |
+
# ds = Dataset.from_generator(partial(gen, p2p))
|
| 229 |
+
# ds.save_to_disk(DataPaths.PAPER_DATA_PATH)
|
| 230 |
+
|
| 231 |
+
# ### Extract paper embeddings ###
|
| 232 |
+
# print("Extracting embeddings (use GPU if possible)...")
|
| 233 |
+
# # Initialize the embedding processor with model names
|
| 234 |
+
# embedding_processor = EmbeddingProcessor(
|
| 235 |
+
# model_name='sentence-transformers/all-mpnet-base-v2',
|
| 236 |
+
# custom_model_name='salsabiilashifa11/sbert-paper'
|
| 237 |
+
# )
|
| 238 |
+
# # Process dataset and save with embeddings
|
| 239 |
+
# embedding_processor.process_dataset(DataPaths.PAPER_DATA_PATH, DataPaths.EMBD_PATH, batch_size=128)
|
| 240 |
|
| 241 |
### Create front-end data ###
|
| 242 |
|
| 243 |
# Filter ds for paper title, id, authors, and embedding
|
| 244 |
embds = Dataset.load_from_disk(DataPaths.EMBD_PATH)
|
| 245 |
|
| 246 |
+
def join_authors(x):
|
| 247 |
+
x['authors'] = "|-|".join(x['authors'])
|
| 248 |
+
return x
|
| 249 |
+
|
| 250 |
+
import pdb ; pdb.set_trace()
|
| 251 |
+
|
| 252 |
+
embds = embds.map(join_authors)
|
| 253 |
+
|
| 254 |
# save id, title, and authors to disk as CSV
|
| 255 |
+
embds.select_columns(['id', 'title', 'authors']).to_csv(DataPaths.FRONTEND_ITA_PATH)
|
| 256 |
# save embeddings as torch tensor
|
| 257 |
embds_weights = torch.Tensor(embds['embeddings'])
|
| 258 |
torch.save(embds_weights, DataPaths.FRONTEND_WEIGHTS_PATH)
|
requirements.txt
CHANGED
|
@@ -1,4 +1,4 @@
|
|
| 1 |
-
|
| 2 |
streamlit
|
| 3 |
torch
|
| 4 |
transformers
|
|
|
|
| 1 |
+
pandas
|
| 2 |
streamlit
|
| 3 |
torch
|
| 4 |
transformers
|