livctr commited on
Commit
6c2a7c2
·
0 Parent(s):

first commit

Browse files
.gitignore ADDED
@@ -0,0 +1,170 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ .env
2
+ data/*
3
+ runs/*
4
+ logs/*
5
+ nbs/*
6
+
7
+
8
+
9
+ # Byte-compiled / optimized / DLL files
10
+ __pycache__/
11
+ *.py[cod]
12
+ *$py.class
13
+
14
+ # C extensions
15
+ *.so
16
+
17
+ # Distribution / packaging
18
+ .Python
19
+ build/
20
+ develop-eggs/
21
+ dist/
22
+ downloads/
23
+ eggs/
24
+ .eggs/
25
+ lib/
26
+ lib64/
27
+ parts/
28
+ sdist/
29
+ var/
30
+ wheels/
31
+ share/python-wheels/
32
+ *.egg-info/
33
+ .installed.cfg
34
+ *.egg
35
+ MANIFEST
36
+
37
+ # PyInstaller
38
+ # Usually these files are written by a python script from a template
39
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
40
+ *.manifest
41
+ *.spec
42
+
43
+ # Installer logs
44
+ pip-log.txt
45
+ pip-delete-this-directory.txt
46
+
47
+ # Unit test / coverage reports
48
+ htmlcov/
49
+ .tox/
50
+ .nox/
51
+ .coverage
52
+ .coverage.*
53
+ .cache
54
+ nosetests.xml
55
+ coverage.xml
56
+ *.cover
57
+ *.py,cover
58
+ .hypothesis/
59
+ .pytest_cache/
60
+ cover/
61
+
62
+ # Translations
63
+ *.mo
64
+ *.pot
65
+
66
+ # Django stuff:
67
+ *.log
68
+ local_settings.py
69
+ db.sqlite3
70
+ db.sqlite3-journal
71
+
72
+ # Flask stuff:
73
+ instance/
74
+ .webassets-cache
75
+
76
+ # Scrapy stuff:
77
+ .scrapy
78
+
79
+ # Sphinx documentation
80
+ docs/_build/
81
+
82
+ # PyBuilder
83
+ .pybuilder/
84
+ target/
85
+
86
+ # Jupyter Notebook
87
+ .ipynb_checkpoints
88
+
89
+ # IPython
90
+ profile_default/
91
+ ipython_config.py
92
+
93
+ # pyenv
94
+ # For a library or package, you might want to ignore these files since the code is
95
+ # intended to run in multiple environments; otherwise, check them in:
96
+ # .python-version
97
+
98
+ # pipenv
99
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
100
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
101
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
102
+ # install all needed dependencies.
103
+ #Pipfile.lock
104
+
105
+ # poetry
106
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
107
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
108
+ # commonly ignored for libraries.
109
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
110
+ #poetry.lock
111
+
112
+ # pdm
113
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
114
+ #pdm.lock
115
+ # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
116
+ # in version control.
117
+ # https://pdm.fming.dev/latest/usage/project/#working-with-version-control
118
+ .pdm.toml
119
+ .pdm-python
120
+ .pdm-build/
121
+
122
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
123
+ __pypackages__/
124
+
125
+ # Celery stuff
126
+ celerybeat-schedule
127
+ celerybeat.pid
128
+
129
+ # SageMath parsed files
130
+ *.sage.py
131
+
132
+ # Environments
133
+ .env
134
+ .venv
135
+ env/
136
+ venv/
137
+ ENV/
138
+ env.bak/
139
+ venv.bak/
140
+
141
+ # Spyder project settings
142
+ .spyderproject
143
+ .spyproject
144
+
145
+ # Rope project settings
146
+ .ropeproject
147
+
148
+ # mkdocs documentation
149
+ /site
150
+
151
+ # mypy
152
+ .mypy_cache/
153
+ .dmypy.json
154
+ dmypy.json
155
+
156
+ # Pyre type checker
157
+ .pyre/
158
+
159
+ # pytype static type analyzer
160
+ .pytype/
161
+
162
+ # Cython debug symbols
163
+ cython_debug/
164
+
165
+ # PyCharm
166
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
167
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
168
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
169
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
170
+ #.idea/
README.md ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # U.S. ML PhD Recomendation System
2
+
3
+ Disclaimer: results are not 100% accurate and there is likely some bias to how papers / professors are filtered.
4
+
5
+ ### Data Pipeline
6
+
7
+ First, a list of authors are gathered from recent conference proceedings. A batched RAG pipeline is used to determine which persons are U.S. professors (unsure how accurate the LLM here is). This can be reproduced as follows:
8
+
9
+ #### Repeat research until satisfactory
10
+
11
+ ```python
12
+ # Scrape top conferences for potential U.S.-based professors, ~45 mins
13
+ python -m data_pipeline.conference_scraper
14
+ ```
15
+ **Selected conferences**
16
+ - NeurIPS: 2022, 2023
17
+ - ICML: 2023, 2024
18
+ - AISTATS: 2023, 2024
19
+ - COLT: 2023, 2024
20
+ - AAAI: 2023, 2024
21
+ - EMNLP: 2023, 2024
22
+ - CVPR: 2023, 2024
23
+
24
+ ```python
25
+ # Search authors and locally store search results. Uses Bing web search API.
26
+ python -m data_pipeline.us_professor_verifier --batch_search
27
+ ```
28
+
29
+ NOTE 1: you may encounter caught exceptions due to HTTPError or invalid JSON outputs from the LLM. Would suggest to run the above multiple times until results are satisfactory.
30
+
31
+ NOTE 2: This pipeline does not handle name collisions, name changes, initials.
32
+
33
+ #### Create file containing U.S. professor data
34
+
35
+ ```python
36
+ # Use locally stored search results as input to an LLM.
37
+ # Sends as batches, each one waiting for the previous to finish.
38
+ python -m data_pipeline.us_professor_verifier --batch_analyze
39
+ # After some time (at most 24 hrs per batch, ~5 batches), the batch results become available for retrieval.
40
+ # Took ~1 hr for me
41
+ python -m data_pipeline.us_professor_verifier --batch_retrieve
42
+ ```
43
+
44
+ #### Extract embeddings for the relevant papers
45
+ ```python
46
+ # Fetch arxiv data and extract embeddings
47
+ python -m data_pipeline.download_arxiv_kaggle
48
+ ```
data_pipeline/__init__.py ADDED
File without changes
data_pipeline/conference_scraper.py ADDED
@@ -0,0 +1,262 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Scrape data from some famous ML conferences and saves into data/conference.
2
+
3
+ Every scrape function returns a list of 3-lists of the form
4
+ [paper_title, paper_authors, paper_url].
5
+
6
+ Conferences
7
+ -----------
8
+ NeurIPS: 2022, 2023
9
+ ICML: 2023, 2024
10
+ AISTATS: 2023, 2024
11
+ COLT: 2023, 2024
12
+ AAAI: 2023, 2024
13
+ EMNLP: 2023, 2024
14
+ CVPR: 2023, 2024
15
+ -----------
16
+
17
+ Disclaimer
18
+ -----------
19
+ The choice of conferences was sourced from here:
20
+ https://www.kaggle.com/discussions/getting-started/115799
21
+
22
+ The priority of including certain conferences and tracks was based on a 1st-year PhD's
23
+ judgment. Some very top conferences were excluded due to higher activation energy to
24
+ scrape data and/or the ignorance of the 1st-year PhD. Some notable exceptions include
25
+ ICLR, ICCV, ECCV, ACL, NAACL, and many others.
26
+ -----------
27
+ """
28
+
29
+ from collections import defaultdict
30
+ from functools import partial
31
+ import json
32
+ import os
33
+ import requests
34
+ import time
35
+
36
+ from bs4 import BeautifulSoup
37
+ from tqdm import tqdm
38
+
39
+
40
+ SAVE_DIR = "data/conference"
41
+
42
def scrape_nips(year):
    """Scrape the NeurIPS proceedings index for `year`.

    Returns a list of [paper_title, paper_authors, paper_url] 3-lists.
    """
    page = requests.get(f"https://papers.nips.cc/paper/{year}")
    parsed = BeautifulSoup(page.text, "html.parser")

    # Each paper is an <li> whose <a> holds title + link and <i> the authors.
    items = []
    for entry in parsed.find_all('li'):
        title = entry.a.get_text()
        authors = entry.i.get_text()
        if title != "" and authors != "":
            items.append([title, authors, entry.a['href']])
    return items
51
+
52
def scrape_mlr_proceedings(conference, year):
    """Scrape a PMLR proceedings volume (ICML / AISTATS / COLT).

    Returns a list of [paper_title, paper_authors, paper_url] 3-lists.
    Raises KeyError for an unsupported (conference, year) pair.
    """
    # PMLR volume identifier for each supported (conference, year) pair.
    volume_by_conf_year = {
        ("ICML", 2024): "v235",
        ("ICML", 2023): "v202",
        ("AISTATS", 2024): "v238",
        ("AISTATS", 2023): "v206",
        ("COLT", 2024): "v247",
        ("COLT", 2023): "v195",
    }

    volume = volume_by_conf_year[(conference, year)]
    page = requests.get(f"https://proceedings.mlr.press/{volume}")
    parsed = BeautifulSoup(page.text, "html.parser")

    papers = []
    for paper_div in parsed.find_all('div', class_="paper"):
        title = paper_div.find('p', class_="title").get_text()
        authors = paper_div.find('p', class_="details").find('span', class_="authors").get_text()
        link = paper_div.find('p', class_="links").find('a')['href']
        papers.append([title, authors, link])
    return papers
77
+
78
def scrape_aaai():
    """Scrape AAAI-23 and AAAI-24 technical tracks from ojs.aaai.org.

    Walks the first two archive pages to collect track links, filters them to
    the past two years, then collects [title, authors, url] 3-lists per track.
    Sleeps 60s between requests, so a full run takes on the order of 40 mins.
    """
    # Scrape the technical tracks of past two years ('23, '24)
    # Look at first two pages of archives that give links to tracks
    # Look at each track

    # Browser-like UA: the OJS server rejects default python-requests agents.
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3"
    }

    # First two pages
    track_links = []

    aaai_urls = [
        "https://ojs.aaai.org/index.php/AAAI/issue/archive",
        "https://ojs.aaai.org/index.php/AAAI/issue/archive/2",
    ]

    for aaai_url in aaai_urls:

        response = requests.get(aaai_url, headers=headers)
        soup = BeautifulSoup(response.text, "html.parser")

        # Track titles/links live in <h2><a class="title"> elements.
        tracks = [track.find('a', class_="title") for track in soup.find_all('h2')]
        track_links.extend(
            [(track.text.strip(), track['href']) for track in tracks if track is not None]
        )
        print(track_links)

        time.sleep(60)  # respect scraping limits

    # only look at past two years
    track_links = [track_link for track_link in track_links if "AAAI-24" in track_link[0] or "AAAI-23" in track_link[0]]
    print("track links: ", track_links)

    conference_items = []

    for track_link in tqdm(track_links):
        print(f"Going through track {track_link[0]} @ {track_link[1]} ")

        # Scrape tracks
        response = requests.get(track_link[1], headers=headers)
        soup = BeautifulSoup(response.text, "html.parser")

        articles = soup.find_all('div', class_="obj_article_summary")

        for article in articles:

            # First <a> in the summary holds both the title text and the link.
            aref = article.find('a')
            conference_items.append(
                [
                    aref.text.strip(),
                    article.find('div', class_="authors").text.strip(),
                    aref['href'],
                ]
            )

        time.sleep(60)  # respect scraping limits

    return conference_items
137
+
138
def scrape_emnlp(year):
    """Scrape the EMNLP accepted-main-conference page for `year`.

    Returns [title, authors, ''] 3-lists; the page exposes no per-paper URL.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3"
    }

    page = requests.get(
        f"https://{year}.emnlp.org/program/accepted_main_conference/",
        headers=headers,
    )
    parsed = BeautifulSoup(page.text, "html.parser")

    # Each <p> holds the title as its first child and the authors as its last.
    items = []
    for paragraph in parsed.find_all('p'):
        items.append([paragraph.contents[0].text, paragraph.contents[-1].text, ''])
    return items
152
+
153
def scrape_cvpr(year):
    """Scrape the CVPR open-access listing for `year`.

    Returns a list of [paper_title, paper_authors, paper_url] 3-lists.
    """
    cvpr_url = f"https://openaccess.thecvf.com/CVPR{year}?day=all"
    response = requests.get(cvpr_url)
    soup = BeautifulSoup(response.text, "html.parser")

    # Titles/links live in <dt class="ptitle">; authors live in the <dd>
    # entries that contain a <form> (other <dd>s hold unrelated content).
    # (Removed a dead intermediate assignment that was immediately overwritten.)
    dts = soup.find_all('dt', class_="ptitle")

    dds = soup.find_all('dd')
    authors = []
    for dd in dds:
        if dd.find('form') is not None:  # author entry
            authors.append(
                ', '.join([x.text for x in dd.find_all('a')])
            )

    # NOTE(review): zip() silently truncates if the title and author lists
    # differ in length — assumed 1:1 on this page; confirm if counts drift.
    conference_items = [[dt.text, author, dt.a['href']] for dt, author in zip(dts, authors)]
    return conference_items
172
+
173
def save_to_file(conference_items, filename):
    """Write `conference_items` to `filename` as JSON Lines (one item per line)."""
    with open(filename, 'w') as out:
        out.writelines(json.dumps(entry) + '\n' for entry in conference_items)
177
+
178
def load_from_file(filename):
    """Read a JSON Lines file and return its records as a list."""
    with open(filename, 'r') as src:
        return list(map(json.loads, src))
182
+
183
def main():
    """Scrape all configured conferences, resumably, writing one JSON-Lines
    file per conference into SAVE_DIR and logging progress to a temp file."""

    # Conference label -> zero-arg callable returning [title, authors, url] 3-lists.
    scrape_functions = {
        "NeurIPS-2022": partial(scrape_nips, 2022),
        "NeurIPS-2023": partial(scrape_nips, 2023),
        "ICML-2023": partial(scrape_mlr_proceedings, "ICML", 2023),
        "ICML-2024": partial(scrape_mlr_proceedings, "ICML", 2024),
        "AISTATS-2023": partial(scrape_mlr_proceedings, "AISTATS", 2023),
        "AISTATS-2024": partial(scrape_mlr_proceedings, "AISTATS", 2024),
        "COLT-2023": partial(scrape_mlr_proceedings, "COLT", 2023),
        "COLT-2024": partial(scrape_mlr_proceedings, "COLT", 2024),
        "AAAI": scrape_aaai,  # easier to scrape both years at once, takes ~40 mins
        "EMNLP-2023": partial(scrape_emnlp, 2023),
        "EMNLP-2024": partial(scrape_emnlp, 2024),
        "CVPR-2023": partial(scrape_cvpr, 2023),
        "CVPR-2024": partial(scrape_cvpr, 2024),
    }

    def load_progress():
        # A conference counts as already scraped when its .json output exists.
        if os.path.exists(SAVE_DIR):
            file_paths = os.listdir(SAVE_DIR)
            file_paths = [file_path for file_path in file_paths if file_path.endswith('.json')]
            file_paths = [file_path.split('.')[0] for file_path in file_paths]
            return set(file_paths)
        return set()

    def save_progress(conference, file_path):
        # Append the finished conference name to the progress file.
        with open(file_path, 'a') as f:
            f.write(conference + '\n')

    def log_progress(msg, conference, file_path):
        # Append a per-conference status line (success or error).
        with open(file_path, 'a') as f:
            f.write(conference + ': ' + msg + '\n')

    os.makedirs(SAVE_DIR, exist_ok=True)

    # Load previous progress
    scraped_conferences = load_progress()

    # Progress file for current scrape
    progress_file = "conference_scraper_progress.tmp"

    for conference, scrape_function in tqdm(scrape_functions.items()):

        if conference in scraped_conferences:
            print(f"Skipping {conference}, already scraped.")
            log_progress("Success!", conference, progress_file)
            continue

        try:

            print(f"Scraping {conference}")
            save_path = os.path.join(SAVE_DIR, f"{conference}.json")
            conference_items = scrape_function()
            save_to_file(conference_items, save_path)
            print(f"Saved {conference} to {str(save_path)}")
            save_progress(conference, progress_file)
            log_progress("Success!", conference, progress_file)

        except Exception as e:
            # A failed conference is logged and skipped so the rest still run;
            # re-running main() retries only the missing ones.
            print(f"Error scraping {conference}: {e}")
            log_progress(f"ERROR: {e}", conference, progress_file)
            continue

    # Remove progress file
    os.remove(progress_file)
249
+
250
def stats():
    """Print a per-file line count for every file in SAVE_DIR, plus a grand total."""
    grand_total = 0
    for entry in os.listdir(SAVE_DIR):
        with open(os.path.join(SAVE_DIR, entry), 'r') as fh:
            count = sum(1 for _ in fh)
        print(entry + ": " + str(count) + " lines")
        grand_total += count
    print("Total: " + str(grand_total))
258
+
259
+
260
if __name__ == "__main__":
    # Scrape all conferences (resumable), then report per-file line counts.
    main()
    stats()
data_pipeline/download_arxiv_kaggle.py ADDED
@@ -0,0 +1,281 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Pulls papers from arxiv."""
2
+ from collections import defaultdict
3
+ from functools import partial
4
+ from datetime import datetime
5
+ import heapq
6
+ import json
7
+ import os
8
+ from pathlib import Path
9
+ import pickle
10
+
11
+ from datasets import Dataset
12
+ import kaggle
13
+ import numpy as np
14
+ import pandas as pd
15
+ import torch
16
+ import torch.nn.functional as F
17
+ from tqdm import tqdm
18
+ from transformers import AutoTokenizer, AutoModel
19
+
20
+
21
+ arxiv_fname = "arxiv-metadata-oai-snapshot.json"
22
+
23
def download_arxiv_data(path = Path(".")):
    """Download and unzip the Kaggle arxiv snapshot into `path`/data.

    Skips the download when a file matching `arxiv_fname` already exists.
    Returns the path to the snapshot JSON file.
    """
    data_path = path / "data"

    already_present = any(arxiv_fname in entry for entry in os.listdir(data_path))
    if already_present:
        print(f"Data already downloaded at {data_path/arxiv_fname}.")
    else:
        kaggle.api.dataset_download_cli("Cornell-University/arxiv", path=data_path, unzip=True)
    return data_path / arxiv_fname
33
+
34
def get_lbl_from_name(names):
    """Tuple (last_name, first_name, middle_name) => String 'first_name [middle_name] last_name'."""
    labels = []
    for name in names:
        last, first, middle = name[0], name[1], name[2]
        if middle == '':
            labels.append(first + ' ' + last)
        else:
            labels.append(first + ' ' + middle + ' ' + last)
    return labels
41
+
42
def filter_arxiv_for_ml(arxiv_path, obtain_summary=False, authors_of_interest=None):
    """Sifts through downloaded arxiv file to find ML-related papers.

    If `obtain_summary` is True, saves a pickled DataFrame to the same directory as
    the downloaded arxiv file with the name `arxiv_fname` + `-summary.pkl`.

    If `authors_of_interest` is not None, only save ML-related papers by those
    authors; if None, all ML-related papers are kept.  (BUG FIX: a None value
    previously raised TypeError on the `author in None` membership test.)
    """
    # NOTE(review): split('.') misbehaves if a parent directory name contains
    # a dot — assumed not to happen for the Kaggle snapshot path.
    ml_path = str(arxiv_path).split('.')[0]+'-ml.json'
    summary_path = str(arxiv_path).split('.')[0]+'-summary.pkl'

    ml_cats = ['cs.AI', 'cs.CL', 'cs.CV', 'cs.LG', 'stat.ML']

    # Skip entirely when every requested output already exists.
    if obtain_summary and Path(ml_path).exists() and Path(summary_path).exists():
        print(f"File {ml_path} with ML subset of arxiv already exists. Skipping.")
        print(f"Summary file {summary_path} already exists. Skipping.")
        return
    if not obtain_summary and Path(ml_path).exists():
        print(f"File {ml_path} with ML subset of arxiv already exists. Skipping.")
        return

    if obtain_summary:
        gdf = {'categories': [], 'lv_date': []}  # global data

    if authors_of_interest is not None:
        authors_of_interest = set(authors_of_interest)  # O(1) membership tests

    # Load the JSON file line by line
    with open(arxiv_path, 'r') as f1, open(ml_path, 'w') as f2:
        for line in tqdm(f1):
            # Parse each line as JSON
            try:
                entry_data = json.loads(line)
            except json.JSONDecodeError:
                # Skip lines that cannot be parsed as JSON
                continue

            # check categories and last version in entry data
            if (
                obtain_summary
                and 'categories' in entry_data
                and 'versions' in entry_data
                and len(entry_data['versions'])
                and 'created' in entry_data['versions'][-1]
            ):
                gdf['categories'].append(entry_data['categories'])
                gdf['lv_date'].append(entry_data['versions'][-1]['created'])

            # ml data: keep the line when it is ML-categorized and (when an
            # author filter is given) involves at least one author of interest.
            authors_on_paper = get_lbl_from_name(entry_data.get('authors_parsed', []))
            if ('categories' in entry_data
                and (any(cat in entry_data['categories'] for cat in ml_cats))
                and (authors_of_interest is None
                     or any(author in authors_of_interest for author in authors_on_paper))
            ):
                f2.write(line)

    if obtain_summary:
        gdf = pd.DataFrame(gdf)
        gdf['lv_date'] = pd.to_datetime(gdf['lv_date'])
        gdf = gdf.sort_values('lv_date', axis=0).reset_index(drop=True)

        # One sparse indicator column per individual category.
        # (Removed a no-op `cat_combo.split(' ')` statement whose result was discarded.)
        cats = set()
        for cat_combo in gdf['categories'].unique():
            cats.update(cat_combo.split(' '))
        print(f'Columnizing {len(cats)} categories. ')
        for cat in tqdm(cats):
            gdf[cat] = pd.arrays.SparseArray(gdf['categories'].str.contains(cat), fill_value=0, dtype=np.int8)

        # count number of categories item is associated with
        gdf['ncats'] = gdf['categories'].str.count(' ') + 1

        # write to pickle file
        with open(summary_path, 'wb') as f:
            pickle.dump(gdf, f)
117
+
118
def get_professors_and_relevant_papers(us_professors, k=8, cutoff=datetime(2022, 10, 1)):
    """
    Returns a dictionary mapping U.S. professor names to a list of indices
    corresponding to their most recent papers in `data/arxiv-metadata-oai-snapshot-ml.json`.
    This function is necessary to specify the papers we are interested in for each
    professor (e.g., the most recent papers after cutoff)

    Parameters:
    - us_professors: A list of U.S. professor names to match against.
    - k: The number of most recent papers to keep for each professor, based on
      the first version upload date.
    - cutoff (datetime): Only considers papers published after this date
      (default: October 1, 2022).

    Returns:
    - dict: A dictionary where keys are professor names and values are lists of
      (datetime, line_nbr) tuples (a min-heap) for their most recent papers.
    """
    # O(1) membership per author instead of an O(n) list scan per paper.
    us_professors = set(us_professors)

    # professors to tuple of (datetime, papers); a bounded min-heap per
    # professor keeps only the k most recent papers (oldest popped first).
    p2p = defaultdict(list)

    with open('data/arxiv-metadata-oai-snapshot-ml.json', 'r') as f:
        line_nbr = 1
        while True:
            line = f.readline()
            if not line: break

            try:
                ml_data = json.loads(line)
                paper_authors = get_lbl_from_name(ml_data['authors_parsed'])

                # filter the same way as in `conference_scraper.py`
                # ignore solo-authored papers and papers with more than 20 authors
                if len(paper_authors) == 1 or len(paper_authors) > 20:
                    continue

                try:
                    dt = datetime.strptime(ml_data["versions"][0]["created"], '%a, %d %b %Y %H:%M:%S %Z')
                    if dt < cutoff:
                        continue
                except (KeyError, ValueError) as e:
                    print(f"Failed to parse date: {ml_data}")
                    # NOTE(review): unparsable dates fall through with a
                    # pre-cutoff sentinel and are still heap-pushed (they get
                    # evicted first) — confirm this is intended.
                    dt = datetime(2000, 1, 1)  # before cutoff date

                # consider if professor is first-author since we now care about semantics
                for paper_author in paper_authors:
                    if paper_author in us_professors:
                        # make a connection
                        heapq.heappush(p2p[paper_author], (dt, line_nbr))
                        if len(p2p[paper_author]) > k:
                            heapq.heappop(p2p[paper_author])
            except Exception:
                # Narrowed from a bare `except:` so SystemExit/KeyboardInterrupt
                # are no longer swallowed; malformed lines are printed and skipped.
                print(f"{line}")
            line_nbr += 1
    return p2p
173
+
174
def gen(p2p):
    """Yield paper records for every line number referenced in `p2p`.

    Streams `data/arxiv-metadata-oai-snapshot-ml.json` once, emitting one dict
    per relevant line, in ascending line-number order.
    """
    # Union of all line numbers across the per-professor heaps, sorted.
    wanted = sorted({entry[1] for heap in p2p.values() for entry in heap})

    pos = 0
    with open('data/arxiv-metadata-oai-snapshot-ml.json', 'r') as src:
        for line_nbr, line in enumerate(src, start=1):
            if pos >= len(wanted):
                break
            if line_nbr == wanted[pos]:
                record = json.loads(line)
                yield {
                    "line_nbr": line_nbr,
                    "id": record["id"],
                    "title": record["title"],
                    "abstract": record["abstract"],
                    "authors": record["authors_parsed"],
                }
                pos += 1
198
+
199
+
200
class EmbeddingProcessor:
    """Computes L2-normalized sentence embeddings for paper title+abstract pairs."""

    def __init__(self, model_name: str, custom_model_name: str, device: str = "cuda"):
        """Load tokenizer from `model_name` and model weights from `custom_model_name`.

        NOTE(review): tokenizer and weights intentionally come from different
        checkpoints (base tokenizer + fine-tuned model) — confirm they share a
        vocabulary.
        """
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModel.from_pretrained(custom_model_name)
        self.device = torch.device(device)
        self.model.to(self.device)
        torch.cuda.empty_cache()

    @staticmethod
    def mean_pooling(model_output, attention_mask):
        """Average token embeddings across the sequence, ignoring padded positions."""
        # First element of model_output contains all token embeddings
        token_embeddings = model_output[0]
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
        # Clamp avoids division by zero for all-padding rows.
        return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)

    def get_embeddings(self, batch):
        """Return normalized embeddings (list of lists) for a batch dict with
        'title' and 'abstract' lists."""
        title_tkn, abstract_tkn = " [TITLE] ", " [ABSTRACT] "
        titles = batch["title"]
        abstracts = batch["abstract"]

        texts = [title_tkn + t + abstract_tkn + a for t, a in zip(titles, abstracts)]

        # Tokenize sentences
        encoded_input = self.tokenizer(texts, padding=True, truncation=True, return_tensors='pt')
        encoded_input = {k: v.to(self.device) for k, v in encoded_input.items()}

        # Compute token embeddings (inference only, no gradient tracking)
        with torch.no_grad():
            model_output = self.model(**encoded_input)

        # Perform pooling
        embeddings = self.mean_pooling(model_output, encoded_input['attention_mask'])

        # Normalize embeddings
        embeddings = F.normalize(embeddings, p=2, dim=1)

        # Move embeddings to CPU and convert to list
        return embeddings.cpu().numpy().tolist()

    def process_dataset(self, dataset_path: str, save_path: str, batch_size: int = 128):
        """Load a HF dataset from disk, add an 'embeddings' column, save to `save_path`."""
        # Load dataset
        ds = Dataset.load_from_disk(dataset_path)

        # Compute embeddings and add as a new column
        ds_with_embeddings = ds.map(lambda x: {"embeddings": self.get_embeddings(x)}, batched=True, batch_size=batch_size)

        # Save the updated dataset (removed redundant `save_path = save_path` self-assignment)
        ds_with_embeddings.save_to_disk(save_path)
        print(f"Dataset with embeddings saved to {save_path}")
250
+
251
+
252
def main():
    """Downloads arxiv data and extract embeddings for papers.

    Pipeline: download Kaggle snapshot -> filter to ML papers by U.S.
    professors -> select each professor's recent papers -> build a HF dataset
    -> compute and save embeddings.
    """
    print("Downloading data...")
    arxiv_path = download_arxiv_data()
    # Professor names come from the RAG-verified list produced by
    # us_professor_verifier.py.
    with open('data/professor/us_professor.json', 'r') as f:
        authors_of_interest = json.load(f)
        authors_of_interest = [author['name'] for author in authors_of_interest]
    print("Filtering data for ML papers...")
    filter_arxiv_for_ml(arxiv_path, authors_of_interest=authors_of_interest)

    # professor to list of paper indices
    paper_data_path = "data/paper_embeddings/paper_data"
    print("Saving data to disk at " + paper_data_path)
    p2p = get_professors_and_relevant_papers(authors_of_interest)
    ds = Dataset.from_generator(partial(gen, p2p))
    ds.save_to_disk(paper_data_path)

    print("Extracting embeddings (use GPU if possible)...")
    # paper embeddings
    save_path = "data/paper_embeddings/all-mpnet-base-v2-embds"
    # Initialize the embedding processor with model names
    embedding_processor = EmbeddingProcessor(
        model_name='sentence-transformers/all-mpnet-base-v2',
        custom_model_name='salsabiilashifa11/sbert-paper'
    )
    # Process dataset and save with embeddings
    embedding_processor.process_dataset(paper_data_path, save_path, batch_size=128)
279
+
280
if __name__ == "__main__":
    # Entry point: run the full download -> filter -> embed pipeline.
    main()
data_pipeline/loaders.py ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import os
3
+
4
def load_conference_papers(conference_dir='data/conference'):
    """Load every JSON-Lines record from the *.json files in `conference_dir`."""
    papers = []
    for fname in os.listdir(conference_dir):
        if fname.endswith('.json'):
            with open(os.path.join(conference_dir, fname), 'r') as fh:
                papers.extend(json.loads(row) for row in fh)
    return papers
17
+
18
def load_us_professor():
    """Returns a JSON list"""
    # Path is fixed relative to the repo root working directory.
    with open('data/professor/us_professor.json', 'r') as fh:
        return json.load(fh)
data_pipeline/requirements.txt ADDED
File without changes
data_pipeline/schools_scraper.py ADDED
@@ -0,0 +1,196 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # https://medium.com/@donadviser/running-selenium-and-chrome-on-wsl2-cfabe7db4bbb
2
+
3
+ import os
4
+ import time
5
+
6
+ from bs4 import BeautifulSoup
7
+ from dotenv import load_dotenv, find_dotenv
8
+ from langchain_together import ChatTogether
9
+ from langchain_core.output_parsers import StrOutputParser
10
+ from langchain_core.prompts import ChatPromptTemplate
11
+ from langchain_core.runnables import RunnableLambda
12
+ from selenium import webdriver
13
+ from selenium.webdriver.chrome.service import Service
14
+ from selenium.webdriver.common.by import By
15
+ from selenium.webdriver.chrome.options import Options
16
+
17
+ _ = load_dotenv(find_dotenv()) # read local .env file
18
+
19
+
20
def get_service_and_chrome_options():
    """Build the Selenium Service and headless-Chrome Options for WSL2.

    TODO: specific to chromedriver location — assumes Chrome and chromedriver
    are unpacked under `~/chrome-linux64` and `~/chromedriver-linux64`.
    """
    # Define Chrome options
    chrome_options = Options()
    chrome_options.add_argument("--headless")
    chrome_options.add_argument("--no-sandbox")
    # Add more options here if needed

    # Define paths (removed an accidental duplicate expanduser("~") call)
    user_home_dir = os.path.expanduser("~")
    chrome_binary_path = os.path.join(user_home_dir, "chrome-linux64", "chrome")
    chromedriver_path = os.path.join(user_home_dir, "chromedriver-linux64", "chromedriver")

    # Set binary location and service
    chrome_options.binary_location = chrome_binary_path
    service = Service(chromedriver_path)

    return service, chrome_options
39
+
40
+
41
def retrieve_csrankings_content(dump_file="soup.tmp"):
    """Write times higher page to a dump file."""
    # https://medium.com/@donadviser/running-selenium-and-chrome-on-wsl2-cfabe7db4bbb
    # Using WSL2
    # NOTE(review): despite the function name, this fetches the Times Higher
    # Education "best universities in the US" page, not csrankings.org —
    # confirm whether the name or the URL is the intended one.

    service, chrome_options = get_service_and_chrome_options()

    # Initialize Chrome WebDriver
    with webdriver.Chrome(service=service, options=chrome_options) as browser:
        print("Get browser")
        browser.get("https://www.timeshighereducation.com/student/best-universities/best-universities-united-states")

        # Wait for the page to load
        print("Wait for the page to load")
        browser.implicitly_wait(10)

        print("Get html")
        # Retrieve the HTML content
        html_content = browser.page_source

        # Write HTML content to soup.txt
        with open(dump_file, "w") as f:
            f.write(html_content)
64
+
65
+
66
def extract_timeshigher_content(read_file="soup.tmp", dump_file="soup (1).tmp"):
    """Extract universities from a dump file."""
    with open(read_file, "r") as src:
        soup = BeautifulSoup(src.read(), "html.parser")

    # A table row's first <a> holds the university name; skip rows without one.
    names = []
    for row in soup.find_all('tr'):
        anchor = row.find('a')
        if anchor:
            names.append(anchor.get_text())

    # Deduplicate while preserving first-seen order.
    unique_names = list(dict.fromkeys(names))

    # Write universities line-by-line to a new file
    with open(dump_file, "w") as out:
        out.writelines(f"{name}\n" for name in unique_names)
85
+
86
+
87
def get_department_getter():
    """
    Returns a function that leverages LangChain and TogetherAI to get a list of
    department names in a university associated with machine learning.
    """
    # Prompt asks for a numbered list, then a final machine-parsable line
    # beginning with "Answer:" holding semicolon-separated department names.
    template_string = """\
You are an expert in PhD programs and know about \
specific departments at each university.\
You are helping to design a system that generates \
a list of professors that students interested in \
machine learning can apply to for their PhDs. \
Currently, recall is more important than precision. \
Include as many departments as possible, while \
maintaining relevancy. Which departments in {university} \
are associated with machine learning? Please format your \
answer as a numbered list. Afterwards, please generate a \
new line starting with \"Answer:\", followed by a concise \
list of department names generated, separated by
semicolons.\
"""

    prompt_template = ChatPromptTemplate.from_template(template_string)

    # # choose from our 50+ models here: https://docs.together.ai/docs/inference-models
    chat = ChatTogether(
        together_api_key=os.environ["TOGETHER_API_KEY"],
        model="meta-llama/Llama-3-70b-chat-hf",
        temperature=0.3
    )

    output_parser = StrOutputParser()

    def extract_function(text):
        """Returns the line that starts with `Answer:`"""
        if "Answer:" not in text:
            return "No `Answer:` found"
        return text.split("Answer:")[1].strip()

    # Chain: fill prompt -> chat model -> raw string -> extract "Answer:" tail.
    chain = prompt_template | chat | output_parser | RunnableLambda(extract_function)

    def get_department_info(uni):
        """Get department info from the university."""
        return chain.invoke({"university": uni})

    return get_department_info
132
+
133
+
134
def get_department_info(unis_file="soup (1).tmp", deps_file="departments.tsv"):
    """
    Get department info for all universities in `unis_file` and
    write it to `deps_file`.

    Each university is prompted 3 times for better recall; results are
    de-duplicated (order-preserving) and written as one
    `university<TAB>department` row per department.
    """
    department_getter = get_department_getter()
    with open(unis_file, "r") as fin, open(deps_file, "w") as fout:

        # Iterate through universities in `fin`
        for uni in fin.readlines():
            uni = uni.strip()

            deps = []
            # Prompt the LLM multiple times for better recall
            for i in range(3):
                depstr = department_getter(uni)
                time.sleep(3)  # Respect usage limits!
                try:
                    if depstr == "No `Answer:` found":
                        print(f"No departments found for {uni} on {i}'th prompt.")
                    else:
                        deps_ = [d.strip() for d in depstr.split(';')]
                        deps.extend(deps_)
                except Exception as e:
                    # BUG FIX: this message was a plain string, not an
                    # f-string, so "{uni}"/"{i}" were printed literally.
                    print(f"Exception for {uni} on {i}'th prompt: ")
                    print("Parsing string: ", depstr)
                    print(e)

            # Deduplicate deps list
            deps = list(dict.fromkeys(deps))

            # Write to tsv dump file
            for dep in deps:
                fout.write(f"{uni}\t{dep}\n")

            # Print string info
            print(f"{uni}: {deps}")
171
+
172
+
173
+ import requests
174
+
175
def get_faculty_list_potential_links_getter():
    """Returns a function that returns a list of links that may contain faculty lists.

    BUG FIX: the original returned neither the inner closure nor its result,
    so this getter always evaluated to None.
    """
    # Read credentials once so the closure does not hit os.environ per call.
    GOOGLE_API_KEY = os.environ['GOOGLE_API_KEY']
    GOOGLE_SEARCH_ENGINE_ID = os.environ['GOOGLE_SEARCH_ENGINE_ID']

    def get_faculty_list_potential_links(uni, dep):
        """Returns a {result title: link} dict of pages that may contain faculty lists."""
        search_query = f'{uni} {dep} faculty list'

        params = {
            'q': search_query, 'key': GOOGLE_API_KEY, 'cx': GOOGLE_SEARCH_ENGINE_ID
        }

        response = requests.get('https://www.googleapis.com/customsearch/v1', params=params)
        results = response.json()
        # NOTE(review): raises KeyError when the search returns no 'items' —
        # confirm whether callers prefer an empty dict instead.
        title2link = {item['title']: item['link'] for item in results['items']}
        return title2link

    return get_faculty_list_potential_links
192
+
193
+
194
+
195
+ # if __name__ == "__main__":
196
+ # get_department_info()
data_pipeline/us_professor_verifier.py ADDED
@@ -0,0 +1,513 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from collections import defaultdict
2
+ import json
3
+ import os
4
+ import pickle
5
+ import requests
6
+ import time
7
+
8
+ from bs4 import BeautifulSoup
9
+ from dotenv import load_dotenv, find_dotenv
10
+ from langchain.prompts import PromptTemplate
11
+ from openai import OpenAI
12
+ import regex as re
13
+ from tqdm import tqdm
14
+
15
+ from data_pipeline.conference_scraper import get_authors
16
+
17
+
18
+ _ = load_dotenv(find_dotenv())
19
+
20
# Bing Web Search v7 endpoint and credentials.
SEARCH_URL = "https://api.bing.microsoft.com/v7.0/search"
# Read once at import time; raises KeyError if BING_SEARCH_API_KEY is unset.
SUBSCRIPTION_KEY = os.environ["BING_SEARCH_API_KEY"]
# Subscription header plus a desktop-browser User-Agent for the search calls.
HEADERS = {
    "Ocp-Apim-Subscription-Key": SUBSCRIPTION_KEY,
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3",
}
26
+
27
# Few-shot example injected into the prompt for the "is a professor" outcome.
EXAMPLE_PROFESSOR_JSON = {
    "is_professor": True,
    "title": "Assistant Professor",
    "department": "Computer Science",
    "university": "Stanford University",
    "us_university": True,
}

# Few-shot example for the "not a professor" outcome.
# NOTE(review): mixed-case name breaks the UPPER_SNAKE_CASE convention, but it
# is referenced elsewhere in this file, so the name is left unchanged here.
EXAMPLE_not_professor_JSON = {
    "is_professor": False,
    "occupation": "Graduate Student",
    "affiliation": "Carnegie Mellon University"
}
40
+
41
# Prompt template (filled via LangChain's PromptTemplate in get_prompt):
# asks the LLM to classify {person_name} from Bing search hits and reply
# with raw JSON only. Trailing backslashes inside the triple-quoted string
# are line continuations — they do NOT appear in the runtime string.
IS_PROFESSOR_TEMPLATE = """You are a helpful assistant tasked with determining if {person_name} is a machine learning \
professor. You have search results from the query "{person_name} machine learning professor". \
Based on the results, specify if {person_name} is a professor, and if so, provide \
their title, department, university, and whether their university is in the U.S. If not, give their occupation \
and affiliation. Note: multiple people may \
share the same name, so choose the one most likely in machine learning. Further, one person may have multiple \
positions. If this is the case and one of those positions include being a professor, specify they are a professor \
and provide their title, department, university, and whether their university is in the U.S.

Only return the raw JSON, no MarkDown!

If {person_name} **is** a professor, fill out:
- `is_professor`: true
- `title`: e.g., `Assistant Professor`, `Associate Professor`, `Professor` etc.
- `department`: Full name, e.g., `Computer Science` rather than `CS` and `Electrical Engineering` rather than `EE`.
- `university`: Full name, e.g., `California Institute of Technology` rather than `Caltech`
- `us_university`: `true` or `false`

Example:
{professor_json_template}

If {person_name} **is not** a professor, fill out:
- `is_professor`: false
- `occupation`: e.g., `Graduate Student`, `Researcher`, `Engineer`, `Scientist`
- `affiliation`: e.g., `Carnegie Mellon University`, `Deepmind`, `Apple`, `NVIDIA`

Example:
{not_professor_json_template}

Search results (formatted as a numbered list with link name and snippet). \
Again, only return the JSON, just with the dictionary and its fields.
{hits}"""
73
+
74
+ # import httpx
75
def bing_search(person_name, max_retries=0, wait_time=0.5):
    """Query Bing for "<person_name> machine learning professor".

    Retries up to `max_retries` additional times on HTTP errors, sleeping
    `wait_time` seconds between attempts, then raises.

    Returns:
        The parsed JSON response from the Bing Web Search API.
    """
    params = {
        "q": f"{person_name} machine learning professor",
        "count": 10,
        "offset": 0,
        "mkt": "en-US",
        "textFormat": "HTML",
    }

    attempt = 0
    while True:
        try:
            resp = requests.get(SEARCH_URL, headers=HEADERS, params=params)
            resp.raise_for_status()
            return resp.json()
        except requests.HTTPError as http_err:
            if attempt == max_retries:
                raise Exception(f"Max retries reached. Failed to get a valid response for {person_name}") from http_err
            print(f"An error occurred while searching {person_name}: {http_err}. Retrying in {wait_time} seconds ...")
            time.sleep(wait_time)
            attempt += 1
92
+
93
def process_search_results(search_results):
    """Clean up Bing search results into text form.

    Returns:
        A two-element list: [human-readable numbered "name: snippet" hits
        with HTML tags and non-ASCII stripped, numbered URL list].
    """
    pages = search_results["webPages"]["value"]

    # Human-readable part: "<rank>. [<name>]: [<snippet>]" per hit.
    readable_lines = []
    for rank, page in enumerate(pages, start=1):
        readable_lines.append("{0}. [{1}]: [{2}]".format(rank, page["name"], page["snippet"]))
    readable = "\n".join(readable_lines)

    # Drop HTML tags, then any remaining non-ASCII characters.
    text_only = BeautifulSoup(readable, "html.parser").get_text()
    text_only = re.sub(r'[^\x00-\x7F]+', '', text_only)

    # Matching numbered list of raw links.
    url_lines = ["{0}. {1}".format(rank, page["url"])
                 for rank, page in enumerate(pages, start=1)]

    return [text_only, "\n".join(url_lines)]
109
+
110
def get_prompt(person_name, top_hits):
    """Build the classification prompt for one person.

    Fills IS_PROFESSOR_TEMPLATE with the person's name, both JSON few-shot
    examples, and the newline-joined search hits.

    Returns:
        The fully rendered prompt string.
    """
    prompt_template = PromptTemplate(
        template=IS_PROFESSOR_TEMPLATE,
        input_variables=["person_name", "professor_json_template", "not_professor_json_template", "hits"],
    )
    return prompt_template.format(
        person_name=person_name,
        professor_json_template=json.dumps(EXAMPLE_PROFESSOR_JSON),
        not_professor_json_template=json.dumps(EXAMPLE_not_professor_JSON),
        hits="\n".join(top_hits),
    )
122
+
123
def run_chatgpt(prompt, client, model="gpt-4o-mini", system_prompt=None):
    """Send `prompt` to the chat-completions API and return the reply text.

    Uses deterministic settings (temperature 0.0, fixed seed) so repeated
    runs are reproducible. An optional system prompt is prepended when given.
    """
    conversation = [{"role": "user", "content": prompt}]
    if system_prompt:
        conversation.insert(0, {"role": "system", "content": system_prompt})

    completion = client.chat.completions.create(
        model=model,
        messages=conversation,
        temperature=0.0,
        seed=123,
    )
    return completion.choices[0].message.content
137
+
138
def check_json(profile):
    """Validate the shape of an LLM-produced profile dict.

    Raises ValueError on the first missing or ill-typed field; returns None
    when the profile is well-formed.
    """
    if not isinstance(profile, dict):
        raise ValueError("Profile must be a dictionary")

    if "is_professor" not in profile:
        raise ValueError("Profile must contain a 'is_professor' key")

    if profile["is_professor"]:
        # Professor profiles carry title/department/university/us_university.
        for field in ("title", "department", "university", "us_university"):
            if field not in profile:
                raise ValueError(f"Profile must contain a '{field}' key")
        # Exact bool check (not isinstance) kept from the original contract.
        if type(profile["us_university"]) is not bool:
            raise ValueError("Profile 'us_university' must be a boolean")
    else:
        # Non-professor profiles carry occupation/affiliation instead.
        if "occupation" not in profile:
            raise ValueError("Profile must contain an 'occupation' key")
        if "affiliation" not in profile:
            raise ValueError("Profile must contain an 'affiliation' key")
161
+
162
def save_json(profiles, file_path):
    """Serialize `profiles` to `file_path` as pretty-printed JSON.

    Creates missing parent directories; rewrites the whole file each call
    (incremental appends would be cheaper but are not done here).
    """
    os.makedirs(os.path.dirname(file_path), exist_ok=True)
    with open(file_path, 'w') as fout:
        json.dump(profiles, fout, indent=4)
166
+
167
def load_json(file_path):
    """Read and return the JSON document stored at `file_path`."""
    with open(file_path, 'r') as fin:
        return json.load(fin)
170
+
171
def log_progress_to_file(progress_log, file_path):
    """Append progress messages to `file_path`, then a 10-dash separator line.

    Creates missing parent directories; the file is opened in append mode so
    successive checkpoints accumulate.
    """
    os.makedirs(os.path.dirname(file_path), exist_ok=True)
    separator = '\n' + '-' * 10 + '\n'
    with open(file_path, 'a') as fout:
        fout.write('\n'.join(progress_log) + separator)
176
+
177
def search_person(person_name, progress_log):
    """Bing-search a person and return the top 5 readable hits, or "" on failure.

    Appends a success/failure message to `progress_log` either way; failures
    are swallowed deliberately so a whole batch run is not aborted by one
    bad query.
    """
    try:
        raw_results = bing_search(person_name)
        readable, _urls = process_search_results(raw_results)
        top_hits = readable.split("\n")[:5]  # keep the top 5 hits only
        progress_log.append(f"Success: Search results for {person_name}.")
        return top_hits
    except Exception as e:
        print(f"Search exception for {person_name}: ", e)
        progress_log.append(f"Failure: Search exception for {person_name}: {e}")
        return ""
189
+
190
def extract_search_results(person_name, progress_log, client, us_professor_profiles, not_us_professor_profiles, top_hits):
    """Classify `person_name` from search hits via the LLM.

    On success the validated profile is appended to `us_professor_profiles`
    (US professor) or `not_us_professor_profiles` (everyone else); on any
    failure the error is printed and logged and nothing is appended.
    """
    try:
        llm_reply = run_chatgpt(get_prompt(person_name, top_hits), client)
        profile = {"name": person_name, **json.loads(llm_reply)}
        check_json(profile)
        bucket = (us_professor_profiles
                  if profile["is_professor"] and profile["us_university"]
                  else not_us_professor_profiles)
        bucket.append(profile)
    except Exception as e:
        print(f"LLM exception for {person_name}: ", e)
        progress_log.append(f"Failure: LLM exception for {person_name}: {e}")
206
+
207
def research_person(person_name, client, progress_log, us_professor_profiles, not_us_professor_profiles):
    """Research who this person is: search, then LLM-extract if the search worked.

    A failed search is signalled by search_person returning ""; in that case
    no LLM call is made.
    """
    hits = search_person(person_name, progress_log)
    if hits != "":
        extract_search_results(person_name, progress_log, client,
                               us_professor_profiles, not_us_professor_profiles, hits)
213
+
214
+
215
def get_authors(save_dir="data/conference", min_papers=3, ignore_first_author=True):
    """
    Reduce the list of authors to those with at least `min_papers` papers for
    which they are not first authors. Ignores solo-authored papers and papers
    with more than 20 authors.

    Filters authors so that we don't have to do RAG on every author, which is
    monetarily expensive. Feel free to edit if you have more resources.

    NOTE(review): this definition shadows the `get_authors` imported from
    data_pipeline.conference_scraper at the top of this file — confirm which
    of the two is intended to win.

    Args:
        save_dir: Directory of .json files, one JSON record per line, where
            record[1] is a comma-separated author string; "authors.txt" is
            also written here.
        min_papers: Minimum qualifying paper count for an author to be kept.
        ignore_first_author: If True, first authors get no credit for a paper.

    Returns:
        dict mapping author name -> qualifying paper count.
    """
    authors = defaultdict(int)
    for fname in os.listdir(save_dir):
        if not fname.endswith('.json'):
            continue

        with open(os.path.join(save_dir, fname), 'r') as file:
            for line in file:
                item = json.loads(line)
                paper_authors = [x.strip() for x in item[1].split(",")]

                # Ignore solo-authored papers and papers with more than 20 authors.
                if len(paper_authors) == 1 or len(paper_authors) > 20:
                    continue

                # Professors generally are not first authors, so optionally skip
                # the first slot. (Replaces the original index loop and its
                # redundant `len(paper_authors) > 0` check with a slice.)
                credited = paper_authors[1:] if ignore_first_author else paper_authors
                for author in credited:
                    authors[author] += 1

    authors = {k: v for k, v in authors.items() if v >= min_papers}
    os.makedirs(save_dir, exist_ok=True)
    with open(os.path.join(save_dir, "authors.txt"), 'w') as f:
        f.writelines(f"{k}\t{v}\n" for k, v in authors.items())
    return authors
250
+
251
def research_conference_profiles(save_freq=20):
    """Research each author as a stream.

    Searches every filtered conference author, classifies them with the LLM,
    and checkpoints progress/profiles to disk every `save_freq` people.

    NOTE: cannot deal w/ interrupts and continue from past progress.
    """
    authors = get_authors("data/conference")
    person_names = list(authors.keys())

    client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])

    progress_log = []
    us_professor_profiles = []
    not_us_professor_profiles = []

    def log_save_print(i):
        # Checkpoint: flush the progress log and both profile buckets.
        log_progress_to_file(progress_log, 'logs/progress_log.tmp')
        save_json(us_professor_profiles, 'data/professor/us_professor.json')
        save_json(not_us_professor_profiles, 'data/professor/not_us_professor.json')
        print(f"Saved profiles to data/professor/us_professor.json and data/professor/not_us_professor.json after processing {i} people")

    i = 0  # BUG FIX: the final log_save_print raised NameError when person_names was empty
    for i, person_name in enumerate(person_names):
        research_person(person_name, client, progress_log, us_professor_profiles, not_us_professor_profiles)
        if i % save_freq == 0:
            log_save_print(i)

    log_save_print(i)
    print("Research complete.")
279
+
280
def batch_search_person(person_names, progress_log, save_freq=20):
    """Searches everyone given in `person_names`.

    Resumes from data/professor/search_results.json when present, appends
    [name, top_hits] records, checkpoints every `save_freq` queries, and
    rate-limits to at most ~3 queries per second.
    """
    # Might start and stop; pull from previous efforts.
    try:
        prev_researched_authors = load_json("data/professor/search_results.json")
    except (OSError, json.JSONDecodeError):  # BUG FIX: was a bare `except:`
        prev_researched_authors = []
    ignore_set = {x[0] for x in prev_researched_authors}
    data = prev_researched_authors

    # Filter out names we already have results for.
    unseen_person_names = [p for p in person_names if p not in ignore_set]
    if person_names:  # BUG FIX: avoid ZeroDivisionError on an empty input list
        print(f"Already researched {len(ignore_set)} / {len(person_names)} = {len(ignore_set) / len(person_names)} of the dataset")
    person_names = unseen_person_names

    # Continue search. (The per-item ignore_set re-check was dead code after
    # the filtering above and has been dropped.)
    for i in tqdm(range(len(person_names))):
        query_start = time.time()
        top_hits = search_person(person_names[i], progress_log)
        if top_hits != "":
            data.append([person_names[i], top_hits])

        if i % save_freq == 0:
            save_json(data, "data/professor/search_results.json")
            log_progress_to_file(progress_log, 'logs/progress_log.tmp')

        # 3 queries per second max.
        # BUG FIX: the original computed `elapsed - 0.334` (inverted), which
        # never throttled fast queries and slept extra after slow ones; sleep
        # the time REMAINING until 0.334 s has elapsed since query_start.
        wait_time = max((query_start + 0.334) - time.time(), 0.0)
        time.sleep(wait_time)

    save_json(data, "data/professor/search_results.json")
    log_progress_to_file(progress_log, 'logs/progress_log.tmp')
316
+
317
def write_batch_files(search_results_path,
                      prompt_data_path_prefix,
                      model="gpt-4o-mini",
                      max_tokens=1000,
                      temperature=0.0,
                      seed=123,
                      batch_size=1999,  # max_tokens * batch_size < 2M?
                      verbose=0):
    """Convert search results dump to jsonl for LLM batch request.

    Reads [name, hits] pairs from `search_results_path` and writes one
    OpenAI batch-request .jsonl file per `batch_size` prompts, named
    "{prompt_data_path_prefix}_{i}.jsonl".

    Returns:
        List of written batch file paths (empty when there are no prompts).
    """
    with open(search_results_path, "r") as f:
        search_results = json.load(f)

    prompt_datas = []
    for name, hits in search_results:
        prompt_datas.append({
            "custom_id": f"request-{name}",  # don't change, needed for decoding
            "method": "POST",
            "url": "/v1/chat/completions",
            "body": {
                "model": model,
                "temperature": temperature,
                "seed": seed,
                "messages": [{"role": "user", "content": get_prompt(name, hits)}],
                "max_tokens": max_tokens,
            },
        })

    print(f"Number of prompts: {len(prompt_datas)}")
    if verbose > 0 and search_results:
        # BUG FIX: guard against an empty dump (the original read the loop
        # variable after the loop, a NameError when search_results == []).
        print(get_prompt(*search_results[-1]))

    batch_paths = []
    # BUG FIX: ceil-divide so an exact multiple of batch_size no longer
    # produces a trailing empty (and thus invalid) batch file.
    num_batches = (len(prompt_datas) + batch_size - 1) // batch_size
    for i in range(num_batches):
        prompt_data_path = f"{prompt_data_path_prefix}_{i}.jsonl"
        chunk = prompt_datas[i * batch_size:(i + 1) * batch_size]
        with open(prompt_data_path, "w") as f:
            f.writelines(json.dumps(p) + "\n" for p in chunk)
        batch_paths.append(prompt_data_path)

    return batch_paths
359
+
360
def send_batch_files(prompt_data_path_prefix, batch_paths, client, timeout=24*60*60):
    """Create and send the batch request to API endpoint.

    Uploads each jsonl file, creates a batch job for it, and polls (every
    40 s, up to `timeout` seconds) until completion. Persists the batch
    objects to "{prefix}_batches.pkl" and their ids to "{prefix}_ids.txt".

    Returns:
        List of (possibly still-running, if timed out) batch objects.
    """
    batches = []

    print("Batching and sending requests...")
    for batch_path in tqdm(batch_paths):
        # BUG FIX: close the upload handle (the original leaked a bare open()).
        with open(batch_path, "rb") as fin:
            batch_input_file = client.files.create(file=fin, purpose="batch")

        batch_input_file_id = batch_input_file.id
        print(f"Batch input file ID: {batch_input_file_id}")

        batch = client.batches.create(
            input_file_id=batch_input_file_id,
            endpoint="/v1/chat/completions",
            completion_window="24h",
            metadata={
                "description": "search extraction job"
            }
        )

        # Poll until the job completes or the timeout elapses.
        begin = time.time()
        while time.time() - begin < timeout:
            batch = client.batches.retrieve(batch.id)
            if batch.status == "completed":
                break
            time.sleep(40)
            print(f"Status ({time.time()-begin:.2f}): {batch.status}")  # BUG FIX: was `:2f`
        print("seconds elapsed: ", time.time() - begin)
        batches.append(batch)

    # Keeps track of the paths to the batch files
    with open(f"{prompt_data_path_prefix}_batches.pkl", "wb") as f:
        pickle.dump(batches, f)
    with open(f"{prompt_data_path_prefix}_ids.txt", "w") as f:
        f.write("\n".join([x.id for x in batches]))
    return batches
399
+
400
def retrieve_batch_output(client, batch_id):
    """OpenAI batch requests finish within 24 hrs.

    Returns the batch's output file text when the job is done, or the
    sentinel string "INCOMPLETE" while it is still running.
    """
    batch = client.batches.retrieve(batch_id)
    if batch.status != "completed":
        print("Batch process is still in progress.")
        print(batch)
        return "INCOMPLETE"
    return client.files.content(batch.output_file_id).text
409
+
410
def batch_process_llm_output(client, batches):
    """Retrieve finished batch outputs and sort the decoded profiles into the
    US-professor / other buckets, then dump both lists to data/professor/.

    Returns early (saving nothing) if any batch is still running.
    """
    # BUG FIX: use the client passed by the caller — the original immediately
    # overwrote the parameter with a fresh OpenAI(), ignoring the argument.

    outputs = []
    for batch in batches:
        output = retrieve_batch_output(client, batch.id)
        if output == "INCOMPLETE":
            return
        outputs.append(output)

    custom_id_idx = len("request-")  # where the name begins in "custom_id"

    # BUG FIX: accumulate across ALL batches — the original re-initialized
    # these lists inside the per-output loop, so only the last batch's
    # profiles survived to the final dump.
    progress_log = []
    us_professor_profiles = []
    not_us_professor_profiles = []

    for output in outputs:
        for json_obj in output.split('\n'):
            if json_obj == '':
                continue
            try:
                parsed_data = json.loads(json_obj)
                message_content = parsed_data["response"]["body"]["choices"][0]["message"]["content"]
                gpt_profile = {"name": parsed_data["custom_id"][custom_id_idx:]}
                gpt_profile.update(json.loads(message_content))
                check_json(gpt_profile)
                if gpt_profile["is_professor"] and gpt_profile["us_university"]:
                    us_professor_profiles.append(gpt_profile)
                else:
                    not_us_professor_profiles.append(gpt_profile)

                progress_log.append(f"Success: Parsed LLM output for {gpt_profile['name']}")
            except Exception as e:
                # Try to name the failing record; fall back to the raw line.
                try:
                    print(f"Failed to parse json object for custom-id `{parsed_data['custom_id']}`: {e}")
                    progress_log.append(f"Failed: Parsed LLM output for {gpt_profile['name']}: {e}")
                except Exception as e2:
                    print(f"Failed to parse json object `{json_obj}`: {e2}")
                    progress_log.append(f"Failed UNKNOWN: Parsed LLM output: {e2}")

    # NOTE(review): progress_log is collected but never flushed to disk here;
    # call log_progress_to_file(...) if a persistent record is wanted.
    with open("data/professor/us_professor.json", 'w') as file:
        json.dump(us_professor_profiles, file, indent=4)
    with open("data/professor/not_us_professor.json", 'w') as file:
        json.dump(not_us_professor_profiles, file, indent=4)
457
+
458
def main():
    """CLI entry point: run exactly one stage of the verifier pipeline,
    selected by --batch_search / --batch_analyze / --batch_retrieve."""
    import argparse

    parser = argparse.ArgumentParser(
        description="US Professor Verifier: Search or LLM-Analyze batch operations."
    )

    # Exactly one mode flag must be supplied.
    mode_group = parser.add_mutually_exclusive_group(required=True)
    mode_group.add_argument(
        '--batch_search',
        action='store_true',
        help='Batch search the authors.'
    )
    mode_group.add_argument(
        '--batch_analyze',
        action='store_true',
        help='Sends search results to LLM for analysis.'
    )
    mode_group.add_argument(
        '--batch_retrieve',
        action='store_true',
        help='Retrieve results from an LLM batch request, requires --batch_id'
    )

    parser.add_argument(
        '--batch_ids_path',
        type=str,
        help='The batch ID for retrieval'
    )

    args = parser.parse_args()

    prompt_data_path_prefix = "data/professor/prompt_data"

    if args.batch_search:
        authors = get_authors("data/conference")
        authors_list = list(authors.keys())
        print("Researching people...")
        batch_search_person(authors_list, [], save_freq=20)
    elif args.batch_analyze:
        client = OpenAI()
        batch_paths = write_batch_files("data/professor/search_results.json", prompt_data_path_prefix)
        send_batch_files(prompt_data_path_prefix, batch_paths, client)
    elif args.batch_retrieve:
        client = OpenAI()
        with open(f"{prompt_data_path_prefix}_batches.pkl", "rb") as f:
            batches = pickle.load(f)
        batch_process_llm_output(client, batches)
    else:
        # Unreachable given required=True on the group; kept as a guard.
        raise ValueError("Please specify --batch_search, --batch_analyze, or --batch_retrieve.")
510
+
511
+
512
# Script entry point — only runs when executed directly, not on import.
if __name__ == "__main__":
    main()
requirements.txt ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ python-dotenv
2
+ openai
3
+ langchain-together
4
+ lxml
5
+
6
+ einops
7
+ torch
8
+ datasets
9
+ transformers
10
+
11
+ datasets
12
+ transformers