livctr committed on
Commit
15c8d24
·
1 Parent(s): d5f5799

refactor data paths

Browse files
streamlit.py → USMLPhDRecommender.py RENAMED
File without changes
core/recommender.py CHANGED
@@ -1,13 +1,13 @@
1
  from collections import Counter, defaultdict
2
  import json
3
- from operator import itemgetter
4
- from typing import List
5
 
6
  from datasets import Dataset
7
  import torch
8
  import torch.nn.functional as F
9
  from transformers import AutoTokenizer, AutoModel
10
 
 
 
11
 
12
  class EmbeddingProcessor:
13
  def __init__(self,
@@ -63,23 +63,19 @@ class EmbeddingProcessor:
63
  ds_with_embeddings.save_to_disk(save_path)
64
  print(f"Dataset with embeddings saved to {save_path}")
65
 
66
- import os
67
 
68
  class Recommender:
69
  def __init__(self,
70
  embedding_processor: EmbeddingProcessor,
71
- frontend_embds_path: str = "data/frontend_data/all-mpnet-base-v2-embds",
72
- frontend_id2professor_path: str = "data/frontend_data/arxiv_id2professor.json",
73
- frontend_us_professor_path: str = "data/frontend_data/us_professor.json",
74
  ):
75
  self.embedding_processor = embedding_processor
76
- self.ita = Dataset.load_from_disk(os.path.join(frontend_embds_path, "id_title_author"))
77
- self.embds = torch.load(os.path.join(frontend_embds_path, "weights.pt"), weights_only=True)
78
-
79
- # with open(frontend_id2professor_path, 'r') as f:
80
- # self.id2professors = json.load(f)
81
  with open(frontend_us_professor_path, 'r') as f:
82
- # dictionary with professor names as keys and their metadata as values
83
  self.us_professor_profiles = json.load(f)
84
 
85
  def get_top_k(self, query: str, top_k: int = 5):
 
1
  from collections import Counter, defaultdict
2
  import json
 
 
3
 
4
  from datasets import Dataset
5
  import torch
6
  import torch.nn.functional as F
7
  from transformers import AutoTokenizer, AutoModel
8
 
9
+ from data_pipeline.config import DataPaths
10
+
11
 
12
  class EmbeddingProcessor:
13
  def __init__(self,
 
63
  ds_with_embeddings.save_to_disk(save_path)
64
  print(f"Dataset with embeddings saved to {save_path}")
65
 
 
66
 
67
  class Recommender:
68
  def __init__(self,
69
  embedding_processor: EmbeddingProcessor,
70
+ ita_path: str = DataPaths.FRONTEND_ITA_PATH,
71
+ weights_path: str = DataPaths.FRONTEND_WEIGHTS_PATH,
72
+ frontend_us_professor_path: str = DataPaths.FRONTEND_PROF_PATH,
73
  ):
74
  self.embedding_processor = embedding_processor
75
+ self.ita = Dataset.load_from_disk(ita_path)
76
+ self.embds = torch.load(weights_path, weights_only=True)
77
+ # dictionary with professor names as keys and their metadata as values
 
 
78
  with open(frontend_us_professor_path, 'r') as f:
 
79
  self.us_professor_profiles = json.load(f)
80
 
81
  def get_top_k(self, query: str, top_k: int = 5):
data_pipeline/conference_scraper.py CHANGED
@@ -1,4 +1,4 @@
1
- """Scrape data from some famous ML conferences and saves into data/conference.
2
 
3
  Every scrape function returns a list of 3-lists of the form
4
  [paper_title, paper_authors, paper_url].
@@ -36,8 +36,8 @@ import time
36
  from bs4 import BeautifulSoup
37
  from tqdm import tqdm
38
 
 
39
 
40
- SAVE_DIR = "data/conference"
41
 
42
  def scrape_nips(year):
43
  nips_url = f"https://papers.nips.cc/paper/{year}"
@@ -199,8 +199,8 @@ def main():
199
  }
200
 
201
  def load_progress():
202
- if os.path.exists(SAVE_DIR):
203
- file_paths = os.listdir(SAVE_DIR)
204
  file_paths = [file_path for file_path in file_paths if file_path.endswith('.json')]
205
  file_paths = [file_path.split('.')[0] for file_path in file_paths]
206
  return set(file_paths)
@@ -214,7 +214,7 @@ def main():
214
  with open(file_path, 'a') as f:
215
  f.write(conference + ': ' + msg + '\n')
216
 
217
- os.makedirs(SAVE_DIR, exist_ok=True)
218
 
219
  # Load previous progress
220
  scraped_conferences = load_progress()
@@ -232,7 +232,7 @@ def main():
232
  try:
233
 
234
  print(f"Scraping {conference}")
235
- save_path = os.path.join(SAVE_DIR, f"{conference}.json")
236
  conference_items = scrape_function()
237
  save_to_file(conference_items, save_path)
238
  print(f"Saved {conference} to {str(save_path)}")
@@ -249,8 +249,8 @@ def main():
249
 
250
  def stats():
251
  total = 0
252
- for fname in os.listdir(SAVE_DIR):
253
- with open(os.path.join(SAVE_DIR, fname), 'r') as file:
254
  num_lines = sum(1 for _ in file)
255
  print(fname + ": " + str(num_lines) + " lines")
256
  total += num_lines
 
1
+ """Scrape data from some famous ML conferences and saves into `DataPaths.CONFERENCE_DIR`.
2
 
3
  Every scrape function returns a list of 3-lists of the form
4
  [paper_title, paper_authors, paper_url].
 
36
  from bs4 import BeautifulSoup
37
  from tqdm import tqdm
38
 
39
+ from data_pipeline.config import DataPaths
40
 
 
41
 
42
  def scrape_nips(year):
43
  nips_url = f"https://papers.nips.cc/paper/{year}"
 
199
  }
200
 
201
  def load_progress():
202
+ if os.path.exists(DataPaths.CONFERENCE_DIR):
203
+ file_paths = os.listdir(DataPaths.CONFERENCE_DIR)
204
  file_paths = [file_path for file_path in file_paths if file_path.endswith('.json')]
205
  file_paths = [file_path.split('.')[0] for file_path in file_paths]
206
  return set(file_paths)
 
214
  with open(file_path, 'a') as f:
215
  f.write(conference + ': ' + msg + '\n')
216
 
217
+ os.makedirs(DataPaths.CONFERENCE_DIR, exist_ok=True)
218
 
219
  # Load previous progress
220
  scraped_conferences = load_progress()
 
232
  try:
233
 
234
  print(f"Scraping {conference}")
235
+ save_path = os.path.join(DataPaths.CONFERENCE_DIR, f"{conference}.json")
236
  conference_items = scrape_function()
237
  save_to_file(conference_items, save_path)
238
  print(f"Saved {conference} to {str(save_path)}")
 
249
 
250
  def stats():
251
  total = 0
252
+ for fname in os.listdir(DataPaths.CONFERENCE_DIR):
253
+ with open(os.path.join(DataPaths.CONFERENCE_DIR, fname), 'r') as file:
254
  num_lines = sum(1 for _ in file)
255
  print(fname + ": " + str(num_lines) + " lines")
256
  total += num_lines
data_pipeline/config.py ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import os

class DataPaths:
    """Central registry of filesystem paths used by the data pipeline.

    All paths are relative to the working directory the pipeline runs from.
    Keeping them here (instead of string literals scattered across modules)
    is the point of this refactor.
    """

    BASE_DIR = "data"
    LOG_DIR = "logs"

    # Progress log written by long-running scrape/verify jobs.
    PROGRESS_LOG_PATH = os.path.join(LOG_DIR, 'progress_log.tmp')

    # Scraped conference paper dumps (one .json-lines file per conference).
    CONFERENCE_DIR = os.path.join(BASE_DIR, 'conference')
    AUTHORS_PATH = os.path.join(CONFERENCE_DIR, 'authors.txt')

    # Professor research/verification artifacts.
    PROF_DIR = os.path.join(BASE_DIR, 'professor')
    SEARCH_RESULTS_PATH = os.path.join(PROF_DIR, 'search_results.json')
    US_PROF_PATH = os.path.join(PROF_DIR, 'us_professor.json')
    NOT_US_PROF_PATH = os.path.join(PROF_DIR, 'not_us_professor.json')
    PROMPT_DATA_PREFIX = str(os.path.join(PROF_DIR, 'prompt_data'))

    # Raw and ML-filtered arXiv metadata snapshots.
    ARXIV_FNAME = 'arxiv-metadata-oai-snapshot.json'
    ARXIV_PATH = os.path.join(BASE_DIR, ARXIV_FNAME)
    ML_ARXIV_PATH = os.path.join(BASE_DIR, 'arxiv-metadata-oai-snapshot-ml.json')

    # Paper embedding artifacts.
    PAPER_DIR = os.path.join(BASE_DIR, "paper_embeddings")
    EMBD_MODEL = "all-mpnet-base-v2-embds"
    EMBD_PATH = os.path.join(PAPER_DIR, EMBD_MODEL)
    PAPER_DATA_PATH = os.path.join(PAPER_DIR, "paper_data")

    # Slimmed-down data served to the Streamlit front end.
    FRONTEND_DIR = os.path.join(BASE_DIR, 'frontend_data')
    FRONTEND_PROF_PATH = os.path.join(FRONTEND_DIR, 'us_professor.json')
    FRONTEND_EMBD_PATH = os.path.join(FRONTEND_DIR, EMBD_MODEL) # contains id, title, author, weights
    FRONTEND_ITA_PATH = os.path.join(FRONTEND_EMBD_PATH, 'id_title_author')
    FRONTEND_WEIGHTS_PATH = os.path.join(FRONTEND_EMBD_PATH, 'weights.pt')

    @staticmethod
    def ensure_directories():
        """Create every directory the pipeline writes into, if missing.

        BUG FIX: the original referenced RAW_DATA_DIR / PROCESSED_DATA_DIR /
        MODEL_OUTPUT_DIR, none of which exist on this class, so importing
        the module raised AttributeError. Create the directories that the
        class actually defines instead.
        """
        for directory in (
            DataPaths.LOG_DIR,
            DataPaths.CONFERENCE_DIR,
            DataPaths.PROF_DIR,
            DataPaths.PAPER_DIR,
            DataPaths.FRONTEND_EMBD_PATH,  # also creates FRONTEND_DIR
        ):
            os.makedirs(directory, exist_ok=True)

# Call this early in the pipeline so writers never hit a missing directory.
DataPaths.ensure_directories()
data_pipeline/loaders.py DELETED
@@ -1,22 +0,0 @@
1
- import json
2
- import os
3
-
4
def load_conference_papers(conference_dir='data/conference'):
    """Load every paper record from the JSON-lines files in `conference_dir`.

    Each ``*.json`` file holds one JSON record per line (as written by the
    conference scraper); files with other extensions are ignored.

    Args:
        conference_dir: directory containing the scraped conference dumps.

    Returns:
        list: all parsed records, in ``os.listdir`` order.
    """
    papers = []
    for fname in os.listdir(conference_dir):
        if not fname.endswith('.json'):
            continue
        with open(os.path.join(conference_dir, fname), 'r') as f:
            # Iterate the file directly instead of the manual readline loop.
            for line in f:
                papers.append(json.loads(line))
    return papers
17
-
18
def load_us_professor(path='data/professor/us_professor.json'):
    """Return the U.S. professor profiles stored as a JSON list.

    Args:
        path: JSON file containing the list of profile dicts. Defaults to
            the pipeline's standard location, so existing callers that pass
            no argument are unaffected.

    Returns:
        The decoded JSON list.
    """
    with open(path, 'r') as f:
        return json.load(f)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
data_pipeline/paper_embeddings_extractor.py CHANGED
@@ -16,20 +16,20 @@ import torch
16
  from tqdm import tqdm
17
 
18
  from core.recommender import EmbeddingProcessor
19
-
20
 
21
  arxiv_fname = "arxiv-metadata-oai-snapshot.json"
22
 
23
- def download_arxiv_data(path = Path(".")):
24
- """Downloads and unzips arxiv dataset from Kaggle into the `data` subdirectory of `path`."""
25
  dataset = "Cornell-University/arxiv"
26
- data_path = path/"data"
27
 
28
- if not any([arxiv_fname in file for file in os.listdir(data_path)]):
29
  kaggle.api.dataset_download_cli(dataset, path=data_path, unzip=True)
30
  else:
31
- print(f"Data already downloaded at {data_path/arxiv_fname}.")
32
- return data_path/arxiv_fname
33
 
34
  def get_lbl_from_name(names):
35
  """Tuple (last_name, first_name, middle_name) => String 'first_name [middle_name] last_name'."""
@@ -39,7 +39,7 @@ def get_lbl_from_name(names):
39
  for name in names
40
  ]
41
 
42
- def filter_arxiv_for_ml(arxiv_path, obtain_summary=False, authors_of_interest=None):
43
  """Sifts through downloaded arxiv file to find ML-related papers.
44
 
45
  If `obtain_summary` is True, saves a pickled DataFrame to the same directory as
@@ -47,8 +47,8 @@ def filter_arxiv_for_ml(arxiv_path, obtain_summary=False, authors_of_interest=No
47
 
48
  If `authors_of_interest` is not None, only save ML-related papers by those authors.
49
  """
50
- ml_path = str(arxiv_path).split('.')[0]+'-ml.json'
51
- summary_path = str(arxiv_path).split('.')[0]+'-summary.pkl'
52
 
53
  ml_cats = ['cs.AI', 'cs.CL', 'cs.CV', 'cs.LG', 'stat.ML']
54
 
@@ -67,7 +67,7 @@ def filter_arxiv_for_ml(arxiv_path, obtain_summary=False, authors_of_interest=No
67
  authors_of_interest = set(authors_of_interest)
68
 
69
  # Load the JSON file line by line
70
- with open(arxiv_path, 'r') as f1, open(ml_path, 'w') as f2:
71
  for line in tqdm(f1):
72
  # Parse each line as JSON
73
  try:
@@ -118,7 +118,7 @@ def filter_arxiv_for_ml(arxiv_path, obtain_summary=False, authors_of_interest=No
118
  def get_professors_and_relevant_papers(us_professors, k=8, cutoff=datetime(2022, 10, 1)):
119
  """
120
  Returns a dictionary mapping U.S. professor names to a list of indices
121
- corresponding to their most recent papers in `data/arxiv-metadata-oai-snapshot-ml.json`.
122
  This function is necessary to specify the papers we are interested in for each
123
  professor (e.g., the most recent papers after cutoff)
124
 
@@ -136,7 +136,7 @@ def get_professors_and_relevant_papers(us_professors, k=8, cutoff=datetime(2022,
136
  # professors to tuple of (datetime, arxiv_id)
137
  p2p = defaultdict(list)
138
 
139
- with open('data/arxiv-metadata-oai-snapshot-ml.json', 'r') as f:
140
  while True:
141
  line = f.readline()
142
  if not line: break
@@ -174,7 +174,7 @@ def gen(p2p):
174
  relevant_ids = set()
175
  for value in values:
176
  relevant_ids.update([v[1] for v in value])
177
- with open('data/arxiv-metadata-oai-snapshot-ml.json', 'r') as f:
178
  while True:
179
  line = f.readline()
180
  if not line: break
@@ -193,7 +193,7 @@ def gen(p2p):
193
  def save_paper_to_professor(p2p, save_path):
194
  """Returns a dictionary mapping an Arxiv ID to U.S. professor names
195
 
196
- `p2p`: mapping from professor to list of paper indices in `data/arxiv-metadata-oai-snapshot-ml.json`
197
  `ds`: dataset with Arxiv links and line_nbr
198
  """
199
 
@@ -215,42 +215,39 @@ def main():
215
 
216
  ### Download and filter for ML papers written by U.S. professors ###
217
  print("Downloading data...")
218
- arxiv_path = download_arxiv_data()
219
- with open('data/professor/us_professor.json', 'r') as f:
220
  authors_of_interest = json.load(f)
221
  authors_of_interest = [author['name'] for author in authors_of_interest]
222
  print("Filtering data for ML papers...")
223
- filter_arxiv_for_ml(arxiv_path, authors_of_interest=authors_of_interest)
224
 
225
  ### Create a dataset containing paper info, e.g., title, abstract, authors, etc. ###
226
- paper_data_path = "data/paper_embeddings/paper_data"
227
- print("Saving paper data to disk at " + paper_data_path)
228
  p2p = get_professors_and_relevant_papers(authors_of_interest)
229
  ds = Dataset.from_generator(partial(gen, p2p))
230
- ds.save_to_disk(paper_data_path)
231
-
232
- # ### Extract paper embeddings ###
233
- # print("Extracting embeddings (use GPU if possible)...")
234
- # # Initialize the embedding processor with model names
235
- # embedding_processor = EmbeddingProcessor(
236
- # model_name='sentence-transformers/all-mpnet-base-v2',
237
- # custom_model_name='salsabiilashifa11/sbert-paper'
238
- # )
239
- # # Process dataset and save with embeddings
240
- embds_save_path = "data/paper_embeddings/all-mpnet-base-v2-embds"
241
- # embedding_processor.process_dataset(paper_data_path, embds_save_path, batch_size=128)
242
 
243
  ### Create front-end data ###
244
 
245
  # Filter ds for paper title, id, authors, and embedding
246
- embds = Dataset.load_from_disk(embds_save_path)
247
- embds_frontend_save_path = "data/frontend_data/all-mpnet-base-v2-embds"
248
 
249
  # save id and title to disk
250
- embds.select_columns(['id', 'title', 'authors']).save_to_disk(os.path.join(embds_frontend_save_path, 'id_title_author'))
251
  # save embeddings as torch tensor
252
  embds_weights = torch.Tensor(embds['embeddings'])
253
- torch.save(embds_weights, os.path.join(embds_frontend_save_path, 'weights.pt'))
254
 
255
  if __name__ == "__main__":
256
  main()
 
16
  from tqdm import tqdm
17
 
18
  from core.recommender import EmbeddingProcessor
19
+ from data_pipeline.config import DataPaths
20
 
21
  arxiv_fname = "arxiv-metadata-oai-snapshot.json"
22
 
23
+ def download_arxiv_data():
24
+ """Downloads and unzips arxiv dataset from Kaggle into `data` directory."""
25
  dataset = "Cornell-University/arxiv"
26
+ data_path = DataPaths.BASE_DIR
27
 
28
+ if not any([DataPaths.ARXIV_FNAME in file for file in os.listdir(data_path)]):
29
  kaggle.api.dataset_download_cli(dataset, path=data_path, unzip=True)
30
  else:
31
+ print(f"Data already downloaded at {DataPaths.ARXIV_FNAME}.")
32
+ return DataPaths.ARXIV_FNAME
33
 
34
  def get_lbl_from_name(names):
35
  """Tuple (last_name, first_name, middle_name) => String 'first_name [middle_name] last_name'."""
 
39
  for name in names
40
  ]
41
 
42
+ def filter_arxiv_for_ml(obtain_summary=False, authors_of_interest=None):
43
  """Sifts through downloaded arxiv file to find ML-related papers.
44
 
45
  If `obtain_summary` is True, saves a pickled DataFrame to the same directory as
 
47
 
48
  If `authors_of_interest` is not None, only save ML-related papers by those authors.
49
  """
50
+ ml_path = str(DataPaths.ARXIV_PATH).split('.')[0]+'-ml.json'
51
+ summary_path = str(DataPaths.ARXIV_PATH).split('.')[0]+'-summary.pkl'
52
 
53
  ml_cats = ['cs.AI', 'cs.CL', 'cs.CV', 'cs.LG', 'stat.ML']
54
 
 
67
  authors_of_interest = set(authors_of_interest)
68
 
69
  # Load the JSON file line by line
70
+ with open(DataPaths.ARXIV_PATH, 'r') as f1, open(ml_path, 'w') as f2:
71
  for line in tqdm(f1):
72
  # Parse each line as JSON
73
  try:
 
118
  def get_professors_and_relevant_papers(us_professors, k=8, cutoff=datetime(2022, 10, 1)):
119
  """
120
  Returns a dictionary mapping U.S. professor names to a list of indices
121
+ corresponding to their most recent papers in DataPaths.ML_ARXIV_PATH.
122
  This function is necessary to specify the papers we are interested in for each
123
  professor (e.g., the most recent papers after cutoff)
124
 
 
136
  # professors to tuple of (datetime, arxiv_id)
137
  p2p = defaultdict(list)
138
 
139
+ with open(DataPaths.ML_ARXIV_PATH, 'r') as f:
140
  while True:
141
  line = f.readline()
142
  if not line: break
 
174
  relevant_ids = set()
175
  for value in values:
176
  relevant_ids.update([v[1] for v in value])
177
+ with open(DataPaths.ML_ARXIV_PATH, 'r') as f:
178
  while True:
179
  line = f.readline()
180
  if not line: break
 
193
  def save_paper_to_professor(p2p, save_path):
194
  """Returns a dictionary mapping an Arxiv ID to U.S. professor names
195
 
196
+ `p2p`: mapping from professor to list of paper indices in DataPaths.ML_ARXIV_PATH
197
  `ds`: dataset with Arxiv links and line_nbr
198
  """
199
 
 
215
 
216
  ### Download and filter for ML papers written by U.S. professors ###
217
  print("Downloading data...")
218
+ download_arxiv_data()
219
+ with open(DataPaths.US_PROF_PATH, 'r') as f:
220
  authors_of_interest = json.load(f)
221
  authors_of_interest = [author['name'] for author in authors_of_interest]
222
  print("Filtering data for ML papers...")
223
+ filter_arxiv_for_ml(authors_of_interest=authors_of_interest)
224
 
225
  ### Create a dataset containing paper info, e.g., title, abstract, authors, etc. ###
226
+ print("Saving paper data to disk at " + DataPaths.PAPER_DATA_PATH)
 
227
  p2p = get_professors_and_relevant_papers(authors_of_interest)
228
  ds = Dataset.from_generator(partial(gen, p2p))
229
+ ds.save_to_disk(DataPaths.PAPER_DATA_PATH)
230
+
231
+ ### Extract paper embeddings ###
232
+ print("Extracting embeddings (use GPU if possible)...")
233
+ # Initialize the embedding processor with model names
234
+ embedding_processor = EmbeddingProcessor(
235
+ model_name='sentence-transformers/all-mpnet-base-v2',
236
+ custom_model_name='salsabiilashifa11/sbert-paper'
237
+ )
238
+ # Process dataset and save with embeddings
239
+ embedding_processor.process_dataset(DataPaths.PAPER_DATA_PATH, DataPaths.EMBD_PATH, batch_size=128)
 
240
 
241
  ### Create front-end data ###
242
 
243
  # Filter ds for paper title, id, authors, and embedding
244
+ embds = Dataset.load_from_disk(DataPaths.EMBD_PATH)
 
245
 
246
  # save id and title to disk
247
+ embds.select_columns(['id', 'title', 'authors']).save_to_disk(DataPaths.FRONTEND_ITA_PATH)
248
  # save embeddings as torch tensor
249
  embds_weights = torch.Tensor(embds['embeddings'])
250
+ torch.save(embds_weights, DataPaths.FRONTEND_WEIGHTS_PATH)
251
 
252
  if __name__ == "__main__":
253
  main()
data_pipeline/schools_scraper.py DELETED
@@ -1,196 +0,0 @@
1
- # https://medium.com/@donadviser/running-selenium-and-chrome-on-wsl2-cfabe7db4bbb
2
-
3
- import os
4
- import time
5
-
6
- from bs4 import BeautifulSoup
7
- from dotenv import load_dotenv, find_dotenv
8
- from langchain_together import ChatTogether
9
- from langchain_core.output_parsers import StrOutputParser
10
- from langchain_core.prompts import ChatPromptTemplate
11
- from langchain_core.runnables import RunnableLambda
12
- from selenium import webdriver
13
- from selenium.webdriver.chrome.service import Service
14
- from selenium.webdriver.common.by import By
15
- from selenium.webdriver.chrome.options import Options
16
-
17
- _ = load_dotenv(find_dotenv()) # read local .env file
18
-
19
-
20
def get_service_and_chrome_options():
    """Build the Selenium Service and Options for headless Chrome on WSL2.

    NOTE(review): assumes Chrome and chromedriver are unpacked under the
    user's home directory (``chrome-linux64/`` and ``chromedriver-linux64/``)
    — adjust the paths below if installed elsewhere.

    Returns:
        tuple: ``(service, chrome_options)`` ready to pass to
        ``webdriver.Chrome(service=..., options=...)``.
    """
    # Headless, sandbox-less Chrome is required in most container/WSL setups.
    chrome_options = Options()
    chrome_options.add_argument("--headless")
    chrome_options.add_argument("--no-sandbox")
    # Add more options here if needed

    # Resolve binary locations. (The original duplicated the expanduser call;
    # the redundant line is removed.)
    user_home_dir = os.path.expanduser("~")
    chrome_binary_path = os.path.join(user_home_dir, "chrome-linux64", "chrome")
    chromedriver_path = os.path.join(user_home_dir, "chromedriver-linux64", "chromedriver")

    # Set binary location and service
    chrome_options.binary_location = chrome_binary_path
    service = Service(chromedriver_path)

    return service, chrome_options
39
-
40
-
41
def retrieve_csrankings_content(dump_file="soup.tmp"):
    """Render the Times Higher Education 'best US universities' page in
    headless Chrome and dump its HTML into `dump_file`.

    NOTE(review): despite the name, this scrapes timeshighereducation.com,
    not CSRankings. Requires the WSL2 Chrome setup from
    `get_service_and_chrome_options` (see the Medium guide linked there).
    """
    service, chrome_options = get_service_and_chrome_options()

    # The context manager guarantees the browser process is torn down.
    with webdriver.Chrome(service=service, options=chrome_options) as browser:
        print("Get browser")
        browser.get("https://www.timeshighereducation.com/student/best-universities/best-universities-united-states")

        # Give dynamic content a chance to render before grabbing the DOM.
        print("Wait for the page to load")
        browser.implicitly_wait(10)

        print("Get html")
        html_content = browser.page_source

        # Persist the rendered page for offline parsing.
        with open(dump_file, "w") as f:
            f.write(html_content)
65
-
66
def extract_timeshigher_content(read_file="soup.tmp", dump_file="soup (1).tmp"):
    """Parse the dumped Times Higher HTML in `read_file` and write one
    university name per line to `dump_file` (order preserved, no duplicates)."""
    with open(read_file, "r") as f:
        soup = BeautifulSoup(f.read(), "html.parser")

    # Each ranking row is a <tr>; the university name is its first <a>.
    names = []
    for row in soup.find_all('tr'):
        link = row.find('a')
        if link:
            names.append(link.get_text())

    # dict.fromkeys deduplicates while keeping first-seen order.
    names = list(dict.fromkeys(names))

    with open(dump_file, "w") as f:
        for uni in names:
            f.write(f"{uni}\n")
85
-
86
-
87
def get_department_getter():
    """
    Returns a function that leverages LangChain and TogetherAI to get a list of
    department names in a university associated with machine learning.

    The returned callable takes a university name (str) and returns the text
    after "Answer:" in the LLM response — a semicolon-separated department
    string — or the literal "No `Answer:` found" when parsing fails.
    """
    # Prompt asks for a numbered list followed by a final "Answer:" line
    # holding a concise semicolon-separated summary, which extract_function
    # below pulls out. Recall is deliberately favored over precision.
    template_string = """\
You are an expert in PhD programs and know about \
specific departments at each university.\
You are helping to design a system that generates \
a list of professors that students interested in \
machine learning can apply to for their PhDs. \
Currently, recall is more important than precision. \
Include as many departments as possible, while \
maintaining relevancy. Which departments in {university} \
are associated with machine learning? Please format your \
answer as a numbered list. Afterwards, please generate a \
new line starting with \"Answer:\", followed by a concise \
list of department names generated, separated by
semicolons.\
"""

    prompt_template = ChatPromptTemplate.from_template(template_string)

    # # choose from our 50+ models here: https://docs.together.ai/docs/inference-models
    chat = ChatTogether(
        together_api_key=os.environ["TOGETHER_API_KEY"],
        model="meta-llama/Llama-3-70b-chat-hf",
        temperature=0.3
    )

    output_parser = StrOutputParser()

    def extract_function(text):
        """Returns the line that starts with `Answer:`"""
        if "Answer:" not in text:
            return "No `Answer:` found"
        return text.split("Answer:")[1].strip()

    # LCEL pipeline: format prompt -> LLM -> raw text -> Answer-line extraction.
    chain = prompt_template | chat | output_parser | RunnableLambda(extract_function)

    def get_department_info(uni):
        """Get department info from the university."""
        return chain.invoke({"university": uni})

    return get_department_info
132
-
133
-
134
def get_department_info(unis_file="soup (1).tmp", deps_file="departments.tsv"):
    """
    Get department info for all universities in `unis_file` (one name per
    line) and write `university<TAB>department` rows to `deps_file`.

    Each university is prompted 3 times to improve recall; results are
    deduplicated (order-preserving) before writing.
    """
    department_getter = get_department_getter()
    with open(unis_file, "r") as fin, open(deps_file, "w") as fout:

        # Iterate through universities in `fin`
        for uni in fin.readlines():
            uni = uni.strip()

            deps = []
            # Prompt the LLM multiple times for better recall
            for i in range(3):
                depstr = department_getter(uni)
                time.sleep(3)  # Respect usage limits!
                try:
                    if depstr == "No `Answer:` found":
                        print(f"No departments found for {uni} on {i}'th prompt.")
                    else:
                        deps_ = [d.strip() for d in depstr.split(';')]
                        deps.extend(deps_)
                except Exception as e:
                    # BUG FIX: the original print was missing the `f` prefix,
                    # so "{uni}" and "{i}" were printed literally.
                    print(f"Exception for {uni} on {i}'th prompt: ")
                    print("Parsing string: ", depstr)
                    print(e)

            # Deduplicate deps list, preserving first-seen order
            deps = list(dict.fromkeys(deps))

            # Write to tsv dump file
            for dep in deps:
                fout.write(f"{uni}\t{dep}\n")

            # Print string info
            print(f"{uni}: {deps}")
171
-
172
-
173
- import requests
174
-
175
def get_faculty_list_potential_links_getter():
    """Returns a function that returns a dict of {result title: link} for
    pages that may contain faculty lists, via the Google Custom Search API.

    Requires GOOGLE_API_KEY and GOOGLE_SEARCH_ENGINE_ID in the environment
    (raises KeyError immediately if either is missing).
    """
    GOOGLE_API_KEY = os.environ['GOOGLE_API_KEY']
    GOOGLE_SEARCH_ENGINE_ID = os.environ['GOOGLE_SEARCH_ENGINE_ID']

    def get_faculty_list_potential_links(uni, dep):
        """Search '<uni> <dep> faculty list' and map result titles to links."""
        search_query = f'{uni} {dep} faculty list'

        params = {
            'q': search_query, 'key': GOOGLE_API_KEY, 'cx': GOOGLE_SEARCH_ENGINE_ID
        }

        response = requests.get('https://www.googleapis.com/customsearch/v1', params=params)
        results = response.json()
        title2link = {item['title']: item['link'] for item in results['items']}
        # BUG FIX: the original built this mapping but never returned it.
        return title2link

    # BUG FIX: the original never returned the closure, making the getter useless.
    return get_faculty_list_potential_links
194
-
195
- # if __name__ == "__main__":
196
- # get_department_info()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
data_pipeline/us_professor_verifier.py CHANGED
@@ -13,6 +13,7 @@ import regex as re
13
  from tqdm import tqdm
14
 
15
  from data_pipeline.conference_scraper import get_authors
 
16
 
17
 
18
  _ = load_dotenv(find_dotenv())
@@ -161,7 +162,7 @@ def check_json(profile):
161
 
162
  def save_json(profiles, file_path):
163
  os.makedirs(os.path.dirname(file_path), exist_ok=True)
164
- with open(file_path, 'w') as file: # appending just the new ones would be better
165
  json.dump(profiles, file, indent=4)
166
 
167
  def load_json(file_path):
@@ -212,7 +213,7 @@ def research_person(person_name, client, progress_log, us_professor_profiles, no
212
  extract_search_results(person_name, progress_log, client, us_professor_profiles, not_us_professor_profiles, top_hits)
213
 
214
 
215
- def get_authors(save_dir="data/conference", min_papers=3, ignore_first_author=True):
216
  """
217
  Reduce the list of authors to those with at least `min_papers` papers for
218
  which they are not first authors. Ignores solo-authored papers and papers
@@ -222,11 +223,11 @@ def get_authors(save_dir="data/conference", min_papers=3, ignore_first_author=Tr
222
  monetarily expensive. Feel free to edit if you have more resources.
223
  """
224
  authors = defaultdict(int)
225
- for fname in os.listdir(save_dir):
226
  if not fname.endswith('.json'):
227
  continue
228
 
229
- with open(os.path.join(save_dir, fname), 'r') as file:
230
  for line in file:
231
  item = json.loads(line)
232
  paper_authors = [x.strip() for x in item[1].split(",")]
@@ -242,8 +243,8 @@ def get_authors(save_dir="data/conference", min_papers=3, ignore_first_author=Tr
242
  authors[paper_authors[i]] += 1
243
 
244
  authors = {k: v for k, v in authors.items() if v >= min_papers}
245
- os.makedirs(save_dir, exist_ok=True)
246
- with open(os.path.join(save_dir, "authors.txt"), 'w') as f:
247
  for k, v in authors.items():
248
  f.write(f"{k}\t{v}\n")
249
  return authors
@@ -254,7 +255,7 @@ def research_conference_profiles(save_freq=20):
254
  NOTE: cannot deal w/ interrupts and continue from past progress.
255
  """
256
 
257
- authors = get_authors("data/conference")
258
  person_names = list(authors.keys())
259
 
260
  client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])
@@ -264,10 +265,10 @@ def research_conference_profiles(save_freq=20):
264
  not_us_professor_profiles = []
265
 
266
  def log_save_print(progress_log, us_professor_profiles, not_us_professor_profiles, i):
267
- log_progress_to_file(progress_log, 'logs/progress_log.tmp')
268
- save_json(us_professor_profiles, 'data/professor/us_professor.json')
269
- save_json(not_us_professor_profiles, 'data/professor/not_us_professor.json')
270
- print(f"Saved profiles to data/professor/us_professor.json and data/professor/not_us_professor.json after processing {i} people")
271
 
272
  for i in range(len(person_names)):
273
  research_person(person_names[i], client, progress_log, us_professor_profiles, not_us_professor_profiles)
@@ -281,7 +282,7 @@ def batch_search_person(person_names, progress_log, save_freq=20):
281
  """Searches everyone given in `person_names`."""
282
  # might start and stop, pull from previous efforts
283
  try:
284
- prev_researched_authors = load_json("data/professor/search_results.json")
285
  except:
286
  prev_researched_authors = []
287
  ignore_set = set([x[0] for x in prev_researched_authors])
@@ -304,18 +305,17 @@ def batch_search_person(person_names, progress_log, save_freq=20):
304
  data.append([person_names[i], top_hits])
305
 
306
  if i % save_freq == 0:
307
- save_json(data, "data/professor/search_results.json")
308
- log_progress_to_file(progress_log, 'logs/progress_log.tmp')
309
 
310
  # 3 queries per second max
311
  wait_time = max(time.time() - (query_start + 0.334), 0.0)
312
  time.sleep(wait_time)
313
 
314
- save_json(data, "data/professor/search_results.json")
315
- log_progress_to_file(progress_log, 'logs/progress_log.tmp')
316
 
317
  def write_batch_files(search_results_path,
318
- prompt_data_path_prefix,
319
  model="gpt-4o-mini",
320
  max_tokens=1000,
321
  temperature=0.0,
@@ -348,7 +348,7 @@ def write_batch_files(search_results_path,
348
 
349
  batch_paths = []
350
  for i in range(0, len(prompt_datas) // batch_size + 1):
351
- prompt_data_path = f"{prompt_data_path_prefix}_{i}.jsonl"
352
  batch_range = i * batch_size, (min(len(prompt_datas), (i + 1) * batch_size))
353
  with open(prompt_data_path, "w") as f:
354
  for prompt_data in prompt_datas[batch_range[0]:batch_range[1]]:
@@ -357,7 +357,7 @@ def write_batch_files(search_results_path,
357
 
358
  return batch_paths
359
 
360
- def send_batch_files(prompt_data_path_prefix, batch_paths, client, timeout=24*60*60):
361
  """Create and send the batch request to API endpoint."""
362
  batches = []
363
 
@@ -391,10 +391,8 @@ def send_batch_files(prompt_data_path_prefix, batch_paths, client, timeout=24*60
391
  batches.append(batch)
392
 
393
  # Keeps track of the paths to the batch files
394
- with open(f"{prompt_data_path_prefix}_batches.pkl", "wb") as f:
395
  pickle.dump(batches, f)
396
- with open(f"{prompt_data_path_prefix}_ids.txt", "w") as f:
397
- f.write("\n".join([x.id for x in batches]))
398
  return batches
399
 
400
  def retrieve_batch_output(client, batch_id):
@@ -450,14 +448,14 @@ def batch_process_llm_output(client, batches):
450
  print(f"Failed to parse json object `{json_obj}`: {e2}")
451
  progress_log.append(f"Failed UNKNOWN: Parsed LLM output: {e2}")
452
 
453
- with open("data/professor/us_professor.json", 'w') as file:
454
  json.dump(us_professor_profiles, file, indent=4)
455
 
456
- with open("data/professor/not_us_professor.json", 'w') as file:
457
  json.dump(not_us_professor_profiles, file, indent=4)
458
 
459
- def create_frontend_data(us_professor_profiles_path="data/professor/us_professor.json"):
460
- with open(us_professor_profiles_path, 'r') as file:
461
  us_professor_profiles = json.load(file)
462
 
463
  professors_dict = {
@@ -469,7 +467,7 @@ def create_frontend_data(us_professor_profiles_path="data/professor/us_professor
469
  for professor in us_professor_profiles
470
  }
471
 
472
- with open("data/frontend_data/us_professor.json", 'w') as file:
473
  json.dump(professors_dict, file)
474
 
475
  def main():
@@ -505,24 +503,22 @@ def main():
505
 
506
  args = parser.parse_args()
507
 
508
- prompt_data_path_prefix = "data/professor/prompt_data"
509
-
510
  if args.batch_search:
511
- authors = get_authors("data/conference")
512
  authors_list = list(authors.keys())
513
  print("Researching people...")
514
  progress_log = []
515
  batch_search_person(authors_list, progress_log, save_freq=20)
516
  elif args.batch_analyze:
517
  client = OpenAI()
518
- batch_paths = write_batch_files("data/professor/search_results.json", prompt_data_path_prefix)
519
- send_batch_files(prompt_data_path_prefix, batch_paths, client)
520
  elif args.batch_retrieve:
521
  client = OpenAI()
522
- with open(f"{prompt_data_path_prefix}_batches.pkl", "rb") as f:
523
  batches = pickle.load(f)
524
  batch_process_llm_output(client, batches)
525
- create_frontend_data()
526
  else:
527
  raise ValueError("Please specify --batch_search, --batch_analyze, or --batch_retrieve.")
528
 
 
13
  from tqdm import tqdm
14
 
15
  from data_pipeline.conference_scraper import get_authors
16
+ from data_pipeline.config import DataPaths
17
 
18
 
19
  _ = load_dotenv(find_dotenv())
 
162
 
163
  def save_json(profiles, file_path):
164
  os.makedirs(os.path.dirname(file_path), exist_ok=True)
165
+ with open(file_path, 'w') as file: # TODO: in the future use append mode
166
  json.dump(profiles, file, indent=4)
167
 
168
  def load_json(file_path):
 
213
  extract_search_results(person_name, progress_log, client, us_professor_profiles, not_us_professor_profiles, top_hits)
214
 
215
 
216
+ def get_authors(min_papers=3, ignore_first_author=True):
217
  """
218
  Reduce the list of authors to those with at least `min_papers` papers for
219
  which they are not first authors. Ignores solo-authored papers and papers
 
223
  monetarily expensive. Feel free to edit if you have more resources.
224
  """
225
  authors = defaultdict(int)
226
+ for fname in os.listdir(DataPaths.CONFERENCE_DIR):
227
  if not fname.endswith('.json'):
228
  continue
229
 
230
+ with open(os.path.join(DataPaths.CONFERENCE_DIR, fname), 'r') as file:
231
  for line in file:
232
  item = json.loads(line)
233
  paper_authors = [x.strip() for x in item[1].split(",")]
 
243
  authors[paper_authors[i]] += 1
244
 
245
  authors = {k: v for k, v in authors.items() if v >= min_papers}
246
+ os.makedirs(DataPaths.CONFERENCE_DIR, exist_ok=True)
247
+ with open(DataPaths.AUTHORS_PATH, 'w') as f:
248
  for k, v in authors.items():
249
  f.write(f"{k}\t{v}\n")
250
  return authors
 
255
  NOTE: cannot deal w/ interrupts and continue from past progress.
256
  """
257
 
258
+ authors = get_authors()
259
  person_names = list(authors.keys())
260
 
261
  client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])
 
265
  not_us_professor_profiles = []
266
 
267
  def log_save_print(progress_log, us_professor_profiles, not_us_professor_profiles, i):
268
+ log_progress_to_file(progress_log, DataPaths.PROGRESS_LOG_PATH)
269
+ save_json(us_professor_profiles, DataPaths.US_PROF_PATH)
270
+ save_json(not_us_professor_profiles, DataPaths.NOT_US_PROF_PATH)
271
+ print(f"Saved profiles to {DataPaths.US_PROF_PATH} and {DataPaths.NOT_US_PROF_PATH} after processing {i} people")
272
 
273
  for i in range(len(person_names)):
274
  research_person(person_names[i], client, progress_log, us_professor_profiles, not_us_professor_profiles)
 
282
  """Searches everyone given in `person_names`."""
283
  # might start and stop, pull from previous efforts
284
  try:
285
+ prev_researched_authors = load_json(DataPaths.SEARCH_RESULTS_PATH)
286
  except:
287
  prev_researched_authors = []
288
  ignore_set = set([x[0] for x in prev_researched_authors])
 
305
  data.append([person_names[i], top_hits])
306
 
307
  if i % save_freq == 0:
308
+ save_json(data, DataPaths.SEARCH_RESULTS_PATH)
309
+ log_progress_to_file(progress_log, DataPaths.PROGRESS_LOG_PATH)
310
 
311
  # 3 queries per second max
312
  wait_time = max(time.time() - (query_start + 0.334), 0.0)
313
  time.sleep(wait_time)
314
 
315
+ save_json(data, DataPaths.SEARCH_RESULTS_PATH)
316
+ log_progress_to_file(progress_log, DataPaths.PROGRESS_LOG_PATH)
317
 
318
  def write_batch_files(search_results_path,
 
319
  model="gpt-4o-mini",
320
  max_tokens=1000,
321
  temperature=0.0,
 
348
 
349
  batch_paths = []
350
  for i in range(0, len(prompt_datas) // batch_size + 1):
351
+ prompt_data_path = f"{DataPaths.PROMPT_DATA_PREFIX}_{i:04d}.jsonl"
352
  batch_range = i * batch_size, (min(len(prompt_datas), (i + 1) * batch_size))
353
  with open(prompt_data_path, "w") as f:
354
  for prompt_data in prompt_datas[batch_range[0]:batch_range[1]]:
 
357
 
358
  return batch_paths
359
 
360
+ def send_batch_files(batch_paths, client, timeout=24*60*60):
361
  """Create and send the batch request to API endpoint."""
362
  batches = []
363
 
 
391
  batches.append(batch)
392
 
393
  # Keeps track of the paths to the batch files
394
+ with open(f"{DataPaths.PROMPT_DATA_PREFIX}_batches.pkl", "wb") as f:
395
  pickle.dump(batches, f)
 
 
396
  return batches
397
 
398
  def retrieve_batch_output(client, batch_id):
 
448
  print(f"Failed to parse json object `{json_obj}`: {e2}")
449
  progress_log.append(f"Failed UNKNOWN: Parsed LLM output: {e2}")
450
 
451
+ with open(DataPaths.US_PROF_PATH, 'w') as file:
452
  json.dump(us_professor_profiles, file, indent=4)
453
 
454
+ with open(DataPaths.NOT_US_PROF_PATH, 'w') as file:
455
  json.dump(not_us_professor_profiles, file, indent=4)
456
 
457
+ def create_professor_frontend_data():
458
+ with open(DataPaths.US_PROF_PATH, 'r') as file:
459
  us_professor_profiles = json.load(file)
460
 
461
  professors_dict = {
 
467
  for professor in us_professor_profiles
468
  }
469
 
470
+ with open(DataPaths.FRONTEND_PROF_PATH, 'w') as file:
471
  json.dump(professors_dict, file)
472
 
473
  def main():
 
503
 
504
  args = parser.parse_args()
505
 
 
 
506
  if args.batch_search:
507
+ authors = get_authors()
508
  authors_list = list(authors.keys())
509
  print("Researching people...")
510
  progress_log = []
511
  batch_search_person(authors_list, progress_log, save_freq=20)
512
  elif args.batch_analyze:
513
  client = OpenAI()
514
+ batch_paths = write_batch_files(DataPaths.SEARCH_RESULTS_PATH)
515
+ send_batch_files(batch_paths, client)
516
  elif args.batch_retrieve:
517
  client = OpenAI()
518
+ with open(f"{DataPaths.PROMPT_DATA_PREFIX}_batches.pkl", "rb") as f:
519
  batches = pickle.load(f)
520
  batch_process_llm_output(client, batches)
521
+ create_professor_frontend_data()
522
  else:
523
  raise ValueError("Please specify --batch_search, --batch_analyze, or --batch_retrieve.")
524