livctr committed on
Commit
d5f5799
·
1 Parent(s): 353f1ea

app first draft

Browse files
README.md CHANGED
@@ -6,7 +6,7 @@ Disclaimer: results are not 100% accurate and there is likely some bias to how p
6
 
7
  First, a list of authors is gathered from recent conference proceedings. A batched RAG pipeline is then used to determine which of them are U.S. professors (the accuracy of the LLM at this step has not been measured). This can be reproduced as follows:
8
 
9
- #### Repeat research until satisfactory
10
 
11
  ```python
12
  # Scrape top conferences for potential U.S.-based professors, ~45 mins
@@ -44,5 +44,13 @@ python -m data_pipeline.us_professor_verifier --batch_retrieve
44
  #### Extract embeddings for the relevant papers
45
  ```python
46
  # Fetch arxiv data and extract embeddings
47
- python -m data_pipeline.download_arxiv_kaggle
48
  ```
 
 
 
 
 
 
 
 
 
6
 
7
  First, a list of authors is gathered from recent conference proceedings. A batched RAG pipeline is then used to determine which of them are U.S. professors (the accuracy of the LLM at this step has not been measured). This can be reproduced as follows:
8
 
9
+ #### Repeat scrape until satisfactory
10
 
11
  ```python
12
  # Scrape top conferences for potential U.S.-based professors, ~45 mins
 
44
  #### Extract embeddings for the relevant papers
45
  ```python
46
  # Fetch arxiv data and extract embeddings
47
+ python -m data_pipeline.paper_embeddings_extractor
48
  ```
49
+
50
+ ### Run streamlit
51
+
52
+ ```python
53
+
54
+ streamlit run streamlit.py
55
+
56
+ ```
data_pipeline/download_arxiv_kaggle.py DELETED
@@ -1,281 +0,0 @@
1
- """Pulls papers from arxiv."""
2
- from collections import defaultdict
3
- from functools import partial
4
- from datetime import datetime
5
- import heapq
6
- import json
7
- import os
8
- from pathlib import Path
9
- import pickle
10
-
11
- from datasets import Dataset
12
- import kaggle
13
- import numpy as np
14
- import pandas as pd
15
- import torch
16
- import torch.nn.functional as F
17
- from tqdm import tqdm
18
- from transformers import AutoTokenizer, AutoModel
19
-
20
-
21
arxiv_fname = "arxiv-metadata-oai-snapshot.json"


def download_arxiv_data(path = Path(".")):
    """Download and unzip the arXiv dataset from Kaggle into `path`/data.

    The download is skipped when the `data` subdirectory already contains a
    file whose name includes `arxiv_fname`.

    Parameters:
    - path: Directory that holds (or will hold) the `data` subdirectory.
      A `pathlib.Path` or a plain string is accepted.

    Returns:
    - The path `path/data/<arxiv_fname>` of the metadata snapshot.
    """
    dataset = "Cornell-University/arxiv"
    data_path = Path(path) / "data"

    # A fresh checkout has no `data` directory yet; create it so the listing
    # below cannot fail before anything has been downloaded.
    data_path.mkdir(parents=True, exist_ok=True)

    already_downloaded = any(arxiv_fname in file for file in os.listdir(data_path))
    if already_downloaded:
        print(f"Data already downloaded at {data_path/arxiv_fname}.")
    else:
        kaggle.api.dataset_download_cli(dataset, path=data_path, unzip=True)
    return data_path/arxiv_fname
33
-
34
def get_lbl_from_name(names):
    """Tuple (last_name, first_name, middle_name) => String 'first_name [middle_name] last_name'.

    Parameters:
    - names: Iterable of sequences ordered as (last_name, first_name, middle_name).
      Missing trailing fields are treated as empty strings.

    Returns:
    - A list with one label per entry, in input order. Blank components are
      left out, so a label never starts or ends with a stray space.
    """
    labels = []
    for name in names:
        # Pad short entries so they cannot raise IndexError.
        last, first, middle = (list(name) + ['', '', ''])[:3]
        # Join only the non-empty parts; a blank first or last name would
        # otherwise leave a leading/trailing space that defeats exact matching.
        labels.append(' '.join(part for part in (first, middle, last) if part))
    return labels
41
-
42
def filter_arxiv_for_ml(arxiv_path, obtain_summary=False, authors_of_interest=None):
    """Sift through the downloaded arxiv file to find ML-related papers.

    Matching lines are copied verbatim to a sibling file named after the
    snapshot with the suffix `-ml.json`.

    If `obtain_summary` is True, also saves a pickled DataFrame next to the
    snapshot with the suffix `-summary.pkl`. It holds, for every entry that has
    categories and a dated last version, the category string, the last-version
    date, one sparse 0/1 column per category and the category count `ncats`.

    If `authors_of_interest` is not None, only ML-related papers with at least
    one of those authors are saved; if it is None, every ML-related paper is.

    Nothing is done when the output file(s) already exist.

    Parameters:
    - arxiv_path: Path of the line-delimited JSON snapshot.
    - obtain_summary: Whether to build and pickle the summary DataFrame.
    - authors_of_interest: Optional iterable of author labels in the format
      produced by `get_lbl_from_name`.
    """
    import re  # local import: only needed for the exact-category pattern below

    arxiv_path = Path(arxiv_path)
    # Derive the sibling names from the file stem. Splitting the whole path on
    # '.' would truncate it at the first dot of any directory name.
    ml_path = arxiv_path.with_name(arxiv_path.stem + '-ml.json')
    summary_path = arxiv_path.with_name(arxiv_path.stem + '-summary.pkl')

    ml_cats = {'cs.AI', 'cs.CL', 'cs.CV', 'cs.LG', 'stat.ML'}

    if obtain_summary and ml_path.exists() and summary_path.exists():
        print(f"File {ml_path} with ML subset of arxiv already exists. Skipping.")
        print(f"Summary file {summary_path} already exists. Skipping.")
        return
    if not obtain_summary and ml_path.exists():
        print(f"File {ml_path} with ML subset of arxiv already exists. Skipping.")
        return

    gdf = {'categories': [], 'lv_date': []}  # global data, filled only for the summary

    # None means "no author filter"; an empty collection still filters everything out.
    if authors_of_interest is not None:
        authors_of_interest = set(authors_of_interest)

    # Load the JSON file line by line
    with open(arxiv_path, 'r', encoding='utf-8') as f1, open(ml_path, 'w', encoding='utf-8') as f2:
        for line in tqdm(f1):
            try:
                entry_data = json.loads(line)
            except json.JSONDecodeError:
                # Skip lines that cannot be parsed as JSON
                continue

            # Both the summary and the ML subset need the category string.
            if 'categories' not in entry_data:
                continue

            versions = entry_data.get('versions')
            if obtain_summary and versions and 'created' in versions[-1]:
                gdf['categories'].append(entry_data['categories'])
                gdf['lv_date'].append(versions[-1]['created'])

            # ml data
            if ml_cats.isdisjoint(entry_data['categories'].split()):
                continue
            if authors_of_interest is not None:
                authors_on_paper = get_lbl_from_name(entry_data.get('authors_parsed', []))
                if authors_of_interest.isdisjoint(authors_on_paper):
                    continue
            f2.write(line)

    if obtain_summary:
        gdf = pd.DataFrame(gdf)
        gdf['lv_date'] = pd.to_datetime(gdf['lv_date'])
        gdf = gdf.sort_values('lv_date', axis=0).reset_index(drop=True)

        cats = set()
        for cat_combo in gdf['categories'].unique():
            cats.update(cat_combo.split(' '))
        print(f'Columnizing {len(cats)} categories. ')
        for cat in tqdm(cats):
            # Match the category as a whole space-delimited token. A plain
            # substring/regex search would let 'astro-ph' hit 'astro-ph.CO'
            # and would treat '.' as a wildcard.
            whole_token = r'(?:^|\s)' + re.escape(cat) + r'(?:\s|$)'
            gdf[cat] = pd.arrays.SparseArray(
                gdf['categories'].str.contains(whole_token, regex=True),
                fill_value=0,
                dtype=np.int8,
            )

        # count number of categories item is associated with
        gdf['ncats'] = gdf['categories'].str.count(' ') + 1

        # write to pickle file
        with open(summary_path, 'wb') as f:
            pickle.dump(gdf, f)
117
-
118
def get_professors_and_relevant_papers(us_professors, k=8, cutoff=datetime(2022, 10, 1),
                                       ml_path='data/arxiv-metadata-oai-snapshot-ml.json'):
    """
    Returns a dictionary mapping U.S. professor names to their most recent
    papers in the ML subset of the arxiv snapshot.
    This function is necessary to specify the papers we are interested in for each
    professor (e.g., the most recent papers after cutoff)

    Parameters:
    - us_professors: An iterable of U.S. professor names to match against.
    - k: The number of most recent papers to keep for each professor, based on
        the first version upload date.
    - cutoff (datetime): Papers whose first version is older than this date
        are ignored (default: October 1, 2022).
    - ml_path: Line-delimited JSON file to read (default: the file written by
        `filter_arxiv_for_ml` for the default snapshot location).

    Returns:
    - dict: A defaultdict where keys are professor names and values are heaps
        (lists) of `(first_version_datetime, line_nbr)` tuples. `line_nbr` is
        the 1-based line of the paper in `ml_path`.
    """
    professors = set(us_professors)  # O(1) membership instead of scanning a list

    # professors to tuple of (datetime, papers)
    p2p = defaultdict(list)

    with open(ml_path, 'r', encoding='utf-8') as f:
        # enumerate keeps line_nbr tied to the physical line. A manually
        # incremented counter is skipped by every `continue`, which makes the
        # recorded numbers point at the wrong lines when they are read back.
        for line_nbr, line in enumerate(f, start=1):
            try:
                ml_data = json.loads(line)
                paper_authors = get_lbl_from_name(ml_data['authors_parsed'])
            except (ValueError, KeyError, IndexError, TypeError):
                # Best effort: report the unusable line and move on.
                print(f"{line}")
                continue

            # filter the same way as in `conference_scraper.py`
            # ignore solo-authored papers and papers with more than 20 authors
            if len(paper_authors) == 1 or len(paper_authors) > 20:
                continue

            try:
                dt = datetime.strptime(ml_data["versions"][0]["created"], '%a, %d %b %Y %H:%M:%S %Z')
            except (KeyError, IndexError, ValueError, TypeError):
                print(f"Failed to parse date: {ml_data}")
                # Undated papers are kept, but with an old sentinel date so
                # that any dated paper displaces them from the heap first.
                dt = datetime(2000, 1, 1)
            else:
                if dt < cutoff:
                    continue

            # Every listed author counts, not only the first author. The set
            # intersection also avoids pushing a paper twice for one name.
            for paper_author in professors.intersection(paper_authors):
                papers = p2p[paper_author]
                heapq.heappush(papers, (dt, line_nbr))
                if len(papers) > k:
                    # Min-heap: popping discards the oldest entry.
                    heapq.heappop(papers)
    return p2p
173
-
174
def gen(p2p, ml_path='data/arxiv-metadata-oai-snapshot-ml.json'):
    """Yield one record per paper referenced in `p2p`, in file order.

    Parameters:
    - p2p: Mapping whose values are iterables of tuples; the second element of
      each tuple is a 1-based line number in `ml_path`.
    - ml_path: Line-delimited JSON file the line numbers refer to.

    Yields:
    - dict with the keys `line_nbr`, `id`, `title`, `abstract` and `authors`
      (the latter taken from the entry's `authors_parsed` field). Each
      referenced line is yielded once, even if several keys point at it.
    """
    relevant_lines = {entry[1] for entries in p2p.values() for entry in entries}
    if not relevant_lines:
        return
    last_relevant = max(relevant_lines)

    with open(ml_path, 'r', encoding='utf-8') as f:
        for line_nbr, line in enumerate(f, start=1):
            if line_nbr > last_relevant:
                # Everything requested has been produced; skip the rest of the file.
                break
            if line_nbr not in relevant_lines:
                continue
            data = json.loads(line)
            yield {"line_nbr": line_nbr,
                   "id": data["id"],
                   "title": data["title"],
                   "abstract": data["abstract"],
                   "authors": data["authors_parsed"]}
198
-
199
-
200
class EmbeddingProcessor:
    """Compute normalized, mean-pooled embeddings for paper titles and abstracts."""

    def __init__(self, model_name: str, custom_model_name: str, device: str = "cuda"):
        """Load the tokenizer and model and move the model to `device`.

        Parameters:
        - model_name: Name passed to `AutoTokenizer.from_pretrained`.
        - custom_model_name: Name passed to `AutoModel.from_pretrained`.
        - device: Preferred torch device. A CUDA device is replaced by the CPU
          when CUDA is not available, so the default works on CPU-only hosts.
        """
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModel.from_pretrained(custom_model_name)
        self.device = self._resolve_device(device)
        self.model.to(self.device)
        if self.device.type == "cuda":
            torch.cuda.empty_cache()

    @staticmethod
    def _resolve_device(device):
        """Return `device` as a `torch.device`, falling back to CPU without CUDA."""
        resolved = torch.device(device)
        if resolved.type == "cuda" and not torch.cuda.is_available():
            print("CUDA is not available; falling back to CPU.")
            return torch.device("cpu")
        return resolved

    @staticmethod
    def mean_pooling(model_output, attention_mask):
        """Average the token embeddings over dimension 1, weighted by the mask.

        Parameters:
        - model_output: Indexable whose first element contains all token embeddings.
        - attention_mask: Tensor with one dimension fewer than the token
          embeddings; it is broadcast over the trailing embedding dimension.

        Returns:
        - Tensor of masked means. The divisor is clamped to 1e-9 so an
          all-zero mask does not divide by zero.
        """
        # First element of model_output contains all token embeddings
        token_embeddings = model_output[0]
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
        return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)

    def get_embeddings(self, batch):
        """Embed a batch of papers.

        Parameters:
        - batch: Mapping with parallel sequences under "title" and "abstract".

        Returns:
        - List of L2-normalized embeddings (lists of floats), one per paper.
        """
        title_tkn, abstract_tkn = " [TITLE] ", " [ABSTRACT] "
        titles = batch["title"]
        abstracts = batch["abstract"]

        texts = [title_tkn + t + abstract_tkn + a for t, a in zip(titles, abstracts)]

        # Tokenize sentences
        encoded_input = self.tokenizer(texts, padding=True, truncation=True, return_tensors='pt')
        encoded_input = {k: v.to(self.device) for k, v in encoded_input.items()}

        # Compute token embeddings
        with torch.no_grad():
            model_output = self.model(**encoded_input)

        # Perform pooling
        embeddings = self.mean_pooling(model_output, encoded_input['attention_mask'])

        # Normalize embeddings
        embeddings = F.normalize(embeddings, p=2, dim=1)

        # Move embeddings to CPU and convert to list
        return embeddings.cpu().numpy().tolist()

    def process_dataset(self, dataset_path: str, save_path: str, batch_size: int = 128):
        """Add an "embeddings" column to a saved dataset and store the result.

        Parameters:
        - dataset_path: Directory read with `Dataset.load_from_disk`.
        - save_path: Directory the dataset with embeddings is written to.
        - batch_size: Number of rows handed to `get_embeddings` at a time.
        """
        # Load dataset
        ds = Dataset.load_from_disk(dataset_path)

        # Compute embeddings and add as a new column
        ds_with_embeddings = ds.map(lambda x: {"embeddings": self.get_embeddings(x)}, batched=True, batch_size=batch_size)

        # Save the updated dataset
        ds_with_embeddings.save_to_disk(save_path)
        print(f"Dataset with embeddings saved to {save_path}")
250
-
251
-
252
def main():
    """Run the arxiv pipeline end to end: download, filter, select papers, embed."""
    print("Downloading data...")
    snapshot_path = download_arxiv_data()

    # The professor profiles supply the author names used for filtering.
    with open('data/professor/us_professor.json', 'r') as profile_file:
        profiles = json.load(profile_file)
    professor_names = []
    for profile in profiles:
        professor_names.append(profile['name'])

    print("Filtering data for ML papers...")
    filter_arxiv_for_ml(snapshot_path, authors_of_interest=professor_names)

    # Select the relevant papers per professor and persist them as a dataset.
    dataset_dir = "data/paper_embeddings/paper_data"
    print("Saving data to disk at " + dataset_dir)
    selected = get_professors_and_relevant_papers(professor_names)
    row_source = partial(gen, selected)
    Dataset.from_generator(row_source).save_to_disk(dataset_dir)

    print("Extracting embeddings (use GPU if possible)...")
    embeddings_dir = "data/paper_embeddings/all-mpnet-base-v2-embds"
    # The tokenizer and the model weights are loaded from two different names.
    processor = EmbeddingProcessor(
        model_name='sentence-transformers/all-mpnet-base-v2',
        custom_model_name='salsabiilashifa11/sbert-paper',
    )
    processor.process_dataset(dataset_dir, embeddings_dir, batch_size=128)


if __name__ == "__main__":
    main()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
data_pipeline/requirements.txt CHANGED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ beautifulsoup4==4.12.3
2
+ datasets==3.0.1
3
+ kaggle==1.6.17
4
+ langchain==0.3.4
5
+ langchain_core==0.3.12
6
+ langchain_together==0.2.0
7
+ numpy
8
+ openai==1.52.0
9
+ pandas
10
+ python-dotenv==1.0.1
11
+ regex==2024.9.11
12
+ Requests==2.32.3
13
+ torch
14
+ tqdm==4.66.4
15
+ transformers==4.45.2
data_pipeline/us_professor_verifier.py CHANGED
@@ -452,9 +452,26 @@ def batch_process_llm_output(client, batches):
452
 
453
  with open("data/professor/us_professor.json", 'w') as file:
454
  json.dump(us_professor_profiles, file, indent=4)
 
455
  with open("data/professor/not_us_professor.json", 'w') as file:
456
  json.dump(not_us_professor_profiles, file, indent=4)
457
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
458
  def main():
459
  import argparse
460
 
@@ -505,6 +522,7 @@ def main():
505
  with open(f"{prompt_data_path_prefix}_batches.pkl", "rb") as f:
506
  batches = pickle.load(f)
507
  batch_process_llm_output(client, batches)
 
508
  else:
509
  raise ValueError("Please specify --batch_search, --batch_analyze, or --batch_retrieve.")
510
 
 
452
 
453
  with open("data/professor/us_professor.json", 'w') as file:
454
  json.dump(us_professor_profiles, file, indent=4)
455
+
456
  with open("data/professor/not_us_professor.json", 'w') as file:
457
  json.dump(not_us_professor_profiles, file, indent=4)
458
 
459
def create_frontend_data(us_professor_profiles_path="data/professor/us_professor.json",
                         output_path="data/frontend_data/us_professor.json"):
    """Write a slimmed-down name -> profile mapping as JSON.

    Parameters:
    - us_professor_profiles_path: JSON file holding a list of profile dicts,
      each with the keys 'name', 'title', 'department' and 'university'.
    - output_path: Destination JSON file. Missing parent directories are created.

    The output maps each professor name to a dict with only 'title',
    'department' and 'university'. If a name occurs more than once, the last
    profile wins.

    Raises:
    - KeyError: If a profile lacks one of the four required keys.
    """
    import os  # local import: the module's import list is not visible from this block

    with open(us_professor_profiles_path, 'r') as file:
        us_professor_profiles = json.load(file)

    professors_dict = {
        professor['name']: {
            'title': professor['title'],
            'department': professor['department'],
            'university': professor['university']
        }
        for professor in us_professor_profiles
    }

    # Nothing else creates the output directory, so opening the file on a
    # fresh checkout would otherwise fail with FileNotFoundError.
    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    with open(output_path, 'w') as file:
        json.dump(professors_dict, file)
474
+
475
  def main():
476
  import argparse
477
 
 
522
  with open(f"{prompt_data_path_prefix}_batches.pkl", "rb") as f:
523
  batches = pickle.load(f)
524
  batch_process_llm_output(client, batches)
525
+ create_frontend_data()
526
  else:
527
  raise ValueError("Please specify --batch_search, --batch_analyze, or --batch_retrieve.")
528
 
requirements.txt CHANGED
@@ -1,12 +1 @@
1
- python-dotenv
2
- openai
3
- langchain-together
4
- lxml
5
-
6
- einops
7
- torch and everything else
8
- datasets
9
- transformers
10
-
11
- datasets
12
- transformers
 
1
+ streamlit