update requirements.txt, add frontend_data

Former-commit-id: 3db30844c62bbf374f4bbeba88e8f58197a33b42

Files changed (12) hide show

.gitattributes ADDED Viewed

	@@ -0,0 +1 @@


1	+ data/frontend_data/** filter=lfs diff=lfs merge=lfs -text

.gitignore CHANGED Viewed

@@ -1,5 +1,6 @@
 .env
 data/*
 runs/*
 logs/*
 nbs/*

 .env
 data/*
+!data/frontend_data/
 runs/*
 logs/*
 nbs/*

data/frontend_data/all-mpnet-base-v2-embds/id_title_author/dataset_info.json ADDED Viewed

+{
+  "builder_name": "generator",
+  "citation": "",
+  "config_name": "default",
+  "dataset_name": "generator",
+  "dataset_size": 18496049,
+  "description": "",
+  "download_checksums": {},
+  "download_size": 0,
+  "features": {
+    "id": {
+      "dtype": "string",
+      "_type": "Value"
+    },
+    "title": {
+      "dtype": "string",
+      "_type": "Value"
+    },
+    "authors": {
+      "feature": {
+        "dtype": "string",
+        "_type": "Value"
+      },
+      "_type": "Sequence"
+    }
+  },
+  "homepage": "",
+  "license": "",
+  "size_in_bytes": 18496049,
+  "splits": {
+    "train": {
+      "name": "train",
+      "num_bytes": 18496049,
+      "num_examples": 12867,
+      "dataset_name": "generator"
+    }
+  },
+  "version": {
+    "version_str": "0.0.0",
+    "major": 0,
+    "minor": 0,
+    "patch": 0
+  }
+}

data/frontend_data/all-mpnet-base-v2-embds/id_title_author/state.json ADDED Viewed

+{
+  "_data_files": [
+    {
+      "filename": "data-00000-of-00001.arrow"
+    }
+  ],
+  "_fingerprint": "686bda94e0c233a4",
+  "_format_columns": null,
+  "_format_kwargs": {},
+  "_format_type": null,
+  "_output_all_columns": false,
+  "_split": "train"
+}

data/frontend_data/all-mpnet-base-v2-embds/weights.pt.REMOVED.git-id ADDED Viewed

	@@ -0,0 +1 @@


1	+ 7b1fa83dad18b346ea03429c5a54390006e9465f

data/frontend_data/us_professor.json ADDED Viewed

The diff for this file is too large to render. See raw diff

data_pipeline/conference_scraper.py CHANGED Viewed

@@ -26,7 +26,6 @@ ICLR, ICCV, ECCV, ACL, NAACL, and many others.
 -----------
 """
-from collections import defaultdict
 from functools import partial
 import json
 import os

 -----------
 """
 from functools import partial
 import json
 import os

data_pipeline/config.py CHANGED Viewed

@@ -25,7 +25,7 @@ class DataPaths:
     EMBD_PATH = os.path.join(PAPER_DIR, EMBD_MODEL)
     PAPER_DATA_PATH = os.path.join(PAPER_DIR, "paper_data")
-    FRONTEND_DIR = os.path.join(BASE_DIR, 'frontend_data')
     FRONTEND_PROF_PATH = os.path.join(FRONTEND_DIR, 'us_professor.json')
     FRONTEND_EMBD_PATH = os.path.join(FRONTEND_DIR, EMBD_MODEL)  # contains id, title, author, weights
     FRONTEND_ITA_PATH = os.path.join(FRONTEND_EMBD_PATH, 'id_title_author')

     EMBD_PATH = os.path.join(PAPER_DIR, EMBD_MODEL)
     PAPER_DATA_PATH = os.path.join(PAPER_DIR, "paper_data")
+    FRONTEND_DIR = os.path.join('frontend_data')
     FRONTEND_PROF_PATH = os.path.join(FRONTEND_DIR, 'us_professor.json')
     FRONTEND_EMBD_PATH = os.path.join(FRONTEND_DIR, EMBD_MODEL)  # contains id, title, author, weights
     FRONTEND_ITA_PATH = os.path.join(FRONTEND_EMBD_PATH, 'id_title_author')

data_pipeline/requirements-data-pipeline.txt ADDED Viewed

+beautifulsoup4
+python-dotenv
+datasets
+kaggle
+langchain
+numpy
+openai
+pandas
+regex
+torch
+tqdm

data_pipeline/requirements.txt DELETED Viewed

@@ -1,15 +0,0 @@
-beautifulsoup4==4.12.3
-datasets==3.0.1
-kaggle==1.6.17
-langchain==0.3.4
-langchain_core==0.3.12
-langchain_together==0.2.0
-numpy
-openai==1.52.0
-pandas
-python-dotenv==1.0.1
-regex==2024.9.11
-Requests==2.32.3
-torch
-tqdm==4.66.4
-transformers==4.45.2

data_pipeline/us_professor_verifier.py CHANGED Viewed

@@ -12,7 +12,6 @@ from openai import OpenAI
 import regex as re
 from tqdm import tqdm
-from data_pipeline.conference_scraper import get_authors
 from data_pipeline.config import DataPaths

 import regex as re
 from tqdm import tqdm
 from data_pipeline.config import DataPaths

requirements.txt CHANGED Viewed

+datasets
+streamlit
+torch
+transformers