Spaces:
Sleeping
Sleeping
app first draft
Browse files- README.md +10 -2
- data_pipeline/download_arxiv_kaggle.py +0 -281
- data_pipeline/requirements.txt +15 -0
- data_pipeline/us_professor_verifier.py +18 -0
- requirements.txt +1 -12
README.md
CHANGED
|
@@ -6,7 +6,7 @@ Disclaimer: results are not 100% accurate and there is likely some bias to how p
|
|
| 6 |
|
| 7 |
First, a list of authors is gathered from recent conference proceedings. A batched RAG pipeline is used to determine which persons are U.S. professors (unsure how accurate the LLM here is). This can be reproduced as follows:
|
| 8 |
|
| 9 |
-
#### Repeat
|
| 10 |
|
| 11 |
```python
|
| 12 |
# Scrape top conferences for potential U.S.-based professors, ~45 mins
|
|
@@ -44,5 +44,13 @@ python -m data_pipeline.us_professor_verifier --batch_retrieve
|
|
| 44 |
#### Extract embeddings for the relevant papers
|
| 45 |
```python
|
| 46 |
# Fetch arxiv data and extract embeddings
|
| 47 |
-
python -m data_pipeline.
|
| 48 |
```
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 6 |
|
| 7 |
First, a list of authors is gathered from recent conference proceedings. A batched RAG pipeline is used to determine which persons are U.S. professors (unsure how accurate the LLM here is). This can be reproduced as follows:
|
| 8 |
|
| 9 |
+
#### Repeat scrape until satisfactory
|
| 10 |
|
| 11 |
```python
|
| 12 |
# Scrape top conferences for potential U.S.-based professors, ~45 mins
|
|
|
|
| 44 |
#### Extract embeddings for the relevant papers
|
| 45 |
```python
|
| 46 |
# Fetch arxiv data and extract embeddings
|
| 47 |
+
python -m data_pipeline.paper_embeddings_extractor
|
| 48 |
```
|
| 49 |
+
|
| 50 |
+
### Run streamlit
|
| 51 |
+
|
| 52 |
+
```python
|
| 53 |
+
|
| 54 |
+
streamlit run streamlit.py
|
| 55 |
+
|
| 56 |
+
```
|
data_pipeline/download_arxiv_kaggle.py
DELETED
|
@@ -1,281 +0,0 @@
|
|
| 1 |
-
"""Pulls papers from arxiv."""
|
| 2 |
-
from collections import defaultdict
|
| 3 |
-
from functools import partial
|
| 4 |
-
from datetime import datetime
|
| 5 |
-
import heapq
|
| 6 |
-
import json
|
| 7 |
-
import os
|
| 8 |
-
from pathlib import Path
|
| 9 |
-
import pickle
|
| 10 |
-
|
| 11 |
-
from datasets import Dataset
|
| 12 |
-
import kaggle
|
| 13 |
-
import numpy as np
|
| 14 |
-
import pandas as pd
|
| 15 |
-
import torch
|
| 16 |
-
import torch.nn.functional as F
|
| 17 |
-
from tqdm import tqdm
|
| 18 |
-
from transformers import AutoTokenizer, AutoModel
|
| 19 |
-
|
| 20 |
-
|
| 21 |
-
arxiv_fname = "arxiv-metadata-oai-snapshot.json"

def download_arxiv_data(path=Path(".")):
    """Download and unzip the arxiv dataset from Kaggle into the `data` subdirectory of `path`.

    Parameters:
    - path: base directory; the dataset lands in `path / "data"`.

    Returns:
    - Path to the arxiv metadata JSON file (downloaded or pre-existing).
    """
    dataset = "Cornell-University/arxiv"
    data_path = path / "data"

    # BUG FIX: create the target directory up front — os.listdir() on a
    # missing directory raises FileNotFoundError on a fresh checkout.
    data_path.mkdir(parents=True, exist_ok=True)

    if not any(arxiv_fname in entry for entry in os.listdir(data_path)):
        kaggle.api.dataset_download_cli(dataset, path=data_path, unzip=True)
    else:
        print(f"Data already downloaded at {data_path/arxiv_fname}.")
    return data_path / arxiv_fname
|
| 33 |
-
|
| 34 |
-
def get_lbl_from_name(names):
    """Convert parsed-name tuples (last, first, middle) into display strings.

    Each tuple becomes 'first last' when the middle name is empty,
    otherwise 'first middle last'.
    """
    labels = []
    for parsed in names:
        last, first, middle = parsed[0], parsed[1], parsed[2]
        if middle == '':
            labels.append(f"{first} {last}")
        else:
            labels.append(f"{first} {middle} {last}")
    return labels
|
| 41 |
-
|
| 42 |
-
def filter_arxiv_for_ml(arxiv_path, obtain_summary=False, authors_of_interest=None):
    """Sifts through downloaded arxiv file to find ML-related papers.

    If `obtain_summary` is True, saves a pickled DataFrame to the same directory as
    the downloaded arxiv file with the name `arxiv_fname` + `-summary.pkl`.

    If `authors_of_interest` is not None, only save ML-related papers by those
    authors; if None, every ML-related paper is kept.
    """
    ml_path = str(arxiv_path).split('.')[0] + '-ml.json'
    summary_path = str(arxiv_path).split('.')[0] + '-summary.pkl'

    ml_cats = ['cs.AI', 'cs.CL', 'cs.CV', 'cs.LG', 'stat.ML']

    if obtain_summary and Path(ml_path).exists() and Path(summary_path).exists():
        print(f"File {ml_path} with ML subset of arxiv already exists. Skipping.")
        print(f"Summary file {summary_path} already exists. Skipping.")
        return
    if not obtain_summary and Path(ml_path).exists():
        print(f"File {ml_path} with ML subset of arxiv already exists. Skipping.")
        return

    if obtain_summary:
        gdf = {'categories': [], 'lv_date': []}  # global data

    if authors_of_interest:
        # set for O(1) membership in the per-paper author check below
        authors_of_interest = set(authors_of_interest)

    # Load the JSON file line by line (the snapshot is one JSON object per line)
    with open(arxiv_path, 'r') as f1, open(ml_path, 'w') as f2:
        for line in tqdm(f1):
            try:
                entry_data = json.loads(line)
            except json.JSONDecodeError:
                # Skip lines that cannot be parsed as JSON
                continue

            # check categories and last version in entry data
            if (
                obtain_summary
                and 'categories' in entry_data
                and 'versions' in entry_data
                and len(entry_data['versions'])
                and 'created' in entry_data['versions'][-1]
            ):
                gdf['categories'].append(entry_data['categories'])
                gdf['lv_date'].append(entry_data['versions'][-1]['created'])

            # ml data. `.get` guards entries lacking 'authors_parsed' (would
            # otherwise raise KeyError and abort the whole pass).
            authors_on_paper = get_lbl_from_name(entry_data.get('authors_parsed', []))
            # BUG FIX: with authors_of_interest=None the old membership test
            # raised TypeError; None now means "keep every ML paper", as the
            # docstring always promised.
            if ('categories' in entry_data
                and any(cat in entry_data['categories'] for cat in ml_cats)
                and (authors_of_interest is None
                     or any(author in authors_of_interest for author in authors_on_paper))
            ):
                f2.write(line)

    if obtain_summary:
        gdf = pd.DataFrame(gdf)
        gdf['lv_date'] = pd.to_datetime(gdf['lv_date'])
        gdf = gdf.sort_values('lv_date', axis=0).reset_index(drop=True)

        # one sparse indicator column per individual category
        cats = set()
        for cat_combo in gdf['categories'].unique():
            cats.update(cat_combo.split(' '))  # (removed a dead no-op split)
        print(f'Columnizing {len(cats)} categories. ')
        for cat in tqdm(cats):
            gdf[cat] = pd.arrays.SparseArray(gdf['categories'].str.contains(cat), fill_value=0, dtype=np.int8)

        # count number of categories item is associated with
        gdf['ncats'] = gdf['categories'].str.count(' ') + 1

        # write to pickle file
        with open(summary_path, 'wb') as f:
            pickle.dump(gdf, f)
|
| 117 |
-
|
| 118 |
-
def get_professors_and_relevant_papers(us_professors, k=8, cutoff=datetime(2022, 10, 1)):
    """
    Returns a dictionary mapping U.S. professor names to a list of
    (datetime, line_nbr) tuples for their most recent papers in
    `data/arxiv-metadata-oai-snapshot-ml.json`. This function is necessary to
    specify the papers we are interested in for each professor (e.g., the most
    recent papers after cutoff).

    Parameters:
    - us_professors: A list of U.S. professor names to match against.
    - k: The number of most recent papers to keep for each professor, based on
      the first version upload date.
    - cutoff (datetime): Only considers papers published after this date
      (default: October 1, 2022).

    Returns:
    - dict: A dictionary where keys are professor names and values are min-heaps
      of (datetime, line_nbr) tuples for their most recent papers.
    """
    # professors to heap of (datetime, line_nbr); the min-heap keeps the k newest
    p2p = defaultdict(list)

    # O(1) membership — the check below runs once per author per paper
    us_professors = set(us_professors)

    with open('data/arxiv-metadata-oai-snapshot-ml.json', 'r') as f:
        for line_nbr, line in enumerate(f, start=1):
            try:
                ml_data = json.loads(line)
                paper_authors = get_lbl_from_name(ml_data['authors_parsed'])

                # filter the same way as in `conference_scraper.py`:
                # ignore solo-authored papers and papers with more than 20 authors
                if len(paper_authors) == 1 or len(paper_authors) > 20:
                    continue

                try:
                    dt = datetime.strptime(ml_data["versions"][0]["created"], '%a, %d %b %Y %H:%M:%S %Z')
                    if dt < cutoff:
                        continue
                except (KeyError, ValueError):
                    # unparseable date: keep the paper but rank it as oldest
                    print(f"Failed to parse date: {ml_data}")
                    dt = datetime(2000, 1, 1)  # before cutoff date

                # consider if professor is first-author since we now care about semantics
                for paper_author in paper_authors:
                    if paper_author in us_professors:
                        # make a connection; evict the oldest once we exceed k
                        heapq.heappush(p2p[paper_author], (dt, line_nbr))
                        if len(p2p[paper_author]) > k:
                            heapq.heappop(p2p[paper_author])
            except (json.JSONDecodeError, KeyError):
                # BUG FIX: narrowed from a bare `except:` that swallowed every
                # exception (including KeyboardInterrupt) and hid real bugs.
                print(f"{line}")
    return p2p
|
| 173 |
-
|
| 174 |
-
def gen(p2p):
    """Yield paper records for every line number referenced in `p2p`.

    `p2p` maps professor names to heaps of (datetime, line_nbr) tuples; this
    generator streams `data/arxiv-metadata-oai-snapshot-ml.json` once, in
    order, and yields a record for each distinct referenced line.
    """
    wanted = sorted({entry[1] for papers in p2p.values() for entry in papers})

    pos = 0
    with open('data/arxiv-metadata-oai-snapshot-ml.json', 'r') as f:
        for current_line, raw in enumerate(f, start=1):
            if pos >= len(wanted):
                break
            if current_line == wanted[pos]:
                record = json.loads(raw)
                yield {
                    "line_nbr": current_line,
                    "id": record["id"],
                    "title": record["title"],
                    "abstract": record["abstract"],
                    "authors": record["authors_parsed"],
                }
                pos += 1
|
| 198 |
-
|
| 199 |
-
|
| 200 |
-
class EmbeddingProcessor:
    """Computes sentence embeddings for paper title+abstract pairs.

    Loads the tokenizer from `model_name` and the weights from
    `custom_model_name` (presumably a fine-tune sharing the same
    tokenizer — TODO confirm) and runs mean-pooled, L2-normalized
    inference on `device`.
    """

    def __init__(self, model_name: str, custom_model_name: str, device: str = "cuda"):
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModel.from_pretrained(custom_model_name)
        self.device = torch.device(device)
        self.model.to(self.device)
        # Release any cached GPU memory before processing; a no-op when CUDA
        # has not been initialized.
        torch.cuda.empty_cache()

    @staticmethod
    def mean_pooling(model_output, attention_mask):
        """Mean-pool token embeddings, ignoring padded positions.

        `model_output[0]` contains all token embeddings; the attention mask
        zeroes padding before averaging, and the clamp avoids division by
        zero for fully-masked rows.
        """
        token_embeddings = model_output[0]
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
        return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)

    def get_embeddings(self, batch):
        """Return L2-normalized embeddings (list of lists) for a batch dict
        with parallel 'title' and 'abstract' lists."""
        title_tkn, abstract_tkn = " [TITLE] ", " [ABSTRACT] "
        titles = batch["title"]
        abstracts = batch["abstract"]

        texts = [title_tkn + t + abstract_tkn + a for t, a in zip(titles, abstracts)]

        # Tokenize sentences
        encoded_input = self.tokenizer(texts, padding=True, truncation=True, return_tensors='pt')
        encoded_input = {k: v.to(self.device) for k, v in encoded_input.items()}

        # Compute token embeddings; inference only, so no gradient tracking
        with torch.no_grad():
            model_output = self.model(**encoded_input)

        # Perform pooling
        embeddings = self.mean_pooling(model_output, encoded_input['attention_mask'])

        # Normalize embeddings
        embeddings = F.normalize(embeddings, p=2, dim=1)

        # Move embeddings to CPU and convert to list
        return embeddings.cpu().numpy().tolist()

    def process_dataset(self, dataset_path: str, save_path: str, batch_size: int = 128):
        """Load a saved `datasets.Dataset`, add an 'embeddings' column, and
        save the result to `save_path`."""
        ds = Dataset.load_from_disk(dataset_path)

        # Compute embeddings and add as a new column
        ds_with_embeddings = ds.map(lambda x: {"embeddings": self.get_embeddings(x)}, batched=True, batch_size=batch_size)

        # (removed the redundant `save_path = save_path` self-assignment)
        ds_with_embeddings.save_to_disk(save_path)
        print(f"Dataset with embeddings saved to {save_path}")
|
| 250 |
-
|
| 251 |
-
|
| 252 |
-
def main():
    """Downloads arxiv data and extract embeddings for papers."""
    print("Downloading data...")
    arxiv_path = download_arxiv_data()

    # Restrict the pipeline to verified U.S. professors
    with open('data/professor/us_professor.json', 'r') as f:
        profiles = json.load(f)
    authors_of_interest = [profile['name'] for profile in profiles]

    print("Filtering data for ML papers...")
    filter_arxiv_for_ml(arxiv_path, authors_of_interest=authors_of_interest)

    # Map each professor to their most recent papers, then materialize those
    # papers as a datasets.Dataset on disk.
    paper_data_path = "data/paper_embeddings/paper_data"
    print("Saving data to disk at " + paper_data_path)
    p2p = get_professors_and_relevant_papers(authors_of_interest)
    ds = Dataset.from_generator(partial(gen, p2p))
    ds.save_to_disk(paper_data_path)

    print("Extracting embeddings (use GPU if possible)...")
    save_path = "data/paper_embeddings/all-mpnet-base-v2-embds"
    # Initialize the embedding processor with model names
    processor = EmbeddingProcessor(
        model_name='sentence-transformers/all-mpnet-base-v2',
        custom_model_name='salsabiilashifa11/sbert-paper',
    )
    # Process dataset and save with embeddings
    processor.process_dataset(paper_data_path, save_path, batch_size=128)

if __name__ == "__main__":
    main()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
data_pipeline/requirements.txt
CHANGED
|
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
beautifulsoup4==4.12.3
|
| 2 |
+
datasets==3.0.1
|
| 3 |
+
kaggle==1.6.17
|
| 4 |
+
langchain==0.3.4
|
| 5 |
+
langchain_core==0.3.12
|
| 6 |
+
langchain_together==0.2.0
|
| 7 |
+
numpy
|
| 8 |
+
openai==1.52.0
|
| 9 |
+
pandas
|
| 10 |
+
python-dotenv==1.0.1
|
| 11 |
+
regex==2024.9.11
|
| 12 |
+
Requests==2.32.3
|
| 13 |
+
torch
|
| 14 |
+
tqdm==4.66.4
|
| 15 |
+
transformers==4.45.2
|
data_pipeline/us_professor_verifier.py
CHANGED
|
@@ -452,9 +452,26 @@ def batch_process_llm_output(client, batches):
|
|
| 452 |
|
| 453 |
with open("data/professor/us_professor.json", 'w') as file:
|
| 454 |
json.dump(us_professor_profiles, file, indent=4)
|
|
|
|
| 455 |
with open("data/professor/not_us_professor.json", 'w') as file:
|
| 456 |
json.dump(not_us_professor_profiles, file, indent=4)
|
| 457 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 458 |
def main():
|
| 459 |
import argparse
|
| 460 |
|
|
@@ -505,6 +522,7 @@ def main():
|
|
| 505 |
with open(f"{prompt_data_path_prefix}_batches.pkl", "rb") as f:
|
| 506 |
batches = pickle.load(f)
|
| 507 |
batch_process_llm_output(client, batches)
|
|
|
|
| 508 |
else:
|
| 509 |
raise ValueError("Please specify --batch_search, --batch_analyze, or --batch_retrieve.")
|
| 510 |
|
|
|
|
| 452 |
|
| 453 |
with open("data/professor/us_professor.json", 'w') as file:
|
| 454 |
json.dump(us_professor_profiles, file, indent=4)
|
| 455 |
+
|
| 456 |
with open("data/professor/not_us_professor.json", 'w') as file:
|
| 457 |
json.dump(not_us_professor_profiles, file, indent=4)
|
| 458 |
|
| 459 |
+
def create_frontend_data(us_professor_profiles_path="data/professor/us_professor.json"):
    """Condense verified U.S. professor profiles into a name-keyed dict for the frontend.

    Reads the list of profile dicts at `us_professor_profiles_path` and writes
    `data/frontend_data/us_professor.json` mapping each professor's name to
    their title, department, and university.
    """
    import os

    with open(us_professor_profiles_path, 'r') as file:
        us_professor_profiles = json.load(file)

    professors_dict = {
        professor['name']: {
            'title': professor['title'],
            'department': professor['department'],
            'university': professor['university'],
        }
        for professor in us_professor_profiles
    }

    # BUG FIX: the output directory may not exist on a fresh checkout;
    # open(..., 'w') would raise FileNotFoundError without this.
    os.makedirs("data/frontend_data", exist_ok=True)
    with open("data/frontend_data/us_professor.json", 'w') as file:
        json.dump(professors_dict, file)
|
| 474 |
+
|
| 475 |
def main():
|
| 476 |
import argparse
|
| 477 |
|
|
|
|
| 522 |
with open(f"{prompt_data_path_prefix}_batches.pkl", "rb") as f:
|
| 523 |
batches = pickle.load(f)
|
| 524 |
batch_process_llm_output(client, batches)
|
| 525 |
+
create_frontend_data()
|
| 526 |
else:
|
| 527 |
raise ValueError("Please specify --batch_search, --batch_analyze, or --batch_retrieve.")
|
| 528 |
|
requirements.txt
CHANGED
|
@@ -1,12 +1 @@
|
|
| 1 |
-
|
| 2 |
-
openai
|
| 3 |
-
langchain-together
|
| 4 |
-
lxml
|
| 5 |
-
|
| 6 |
-
einops
|
| 7 |
-
torch and everything else
|
| 8 |
-
datasets
|
| 9 |
-
transformers
|
| 10 |
-
|
| 11 |
-
datasets
|
| 12 |
-
transformers
|
|
|
|
| 1 |
+
streamlit
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|