livctr committed on
Commit
5236c5b
·
1 Parent(s): 631bbf6

update requirements.txt, add frontend_data

Browse files

Former-commit-id: 3db30844c62bbf374f4bbeba88e8f58197a33b42

.gitattributes ADDED
@@ -0,0 +1 @@
 
 
1
+ data/frontend_data/ filter=lfs diff=lfs merge=lfs -text
.gitignore CHANGED
@@ -1,5 +1,6 @@
1
  .env
2
  data/*
 
3
  runs/*
4
  logs/*
5
  nbs/*
 
1
  .env
2
  data/*
3
+ !data/frontend_data/
4
  runs/*
5
  logs/*
6
  nbs/*
data/frontend_data/all-mpnet-base-v2-embds/id_title_author/dataset_info.json ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "builder_name": "generator",
3
+ "citation": "",
4
+ "config_name": "default",
5
+ "dataset_name": "generator",
6
+ "dataset_size": 18496049,
7
+ "description": "",
8
+ "download_checksums": {},
9
+ "download_size": 0,
10
+ "features": {
11
+ "id": {
12
+ "dtype": "string",
13
+ "_type": "Value"
14
+ },
15
+ "title": {
16
+ "dtype": "string",
17
+ "_type": "Value"
18
+ },
19
+ "authors": {
20
+ "feature": {
21
+ "dtype": "string",
22
+ "_type": "Value"
23
+ },
24
+ "_type": "Sequence"
25
+ }
26
+ },
27
+ "homepage": "",
28
+ "license": "",
29
+ "size_in_bytes": 18496049,
30
+ "splits": {
31
+ "train": {
32
+ "name": "train",
33
+ "num_bytes": 18496049,
34
+ "num_examples": 12867,
35
+ "dataset_name": "generator"
36
+ }
37
+ },
38
+ "version": {
39
+ "version_str": "0.0.0",
40
+ "major": 0,
41
+ "minor": 0,
42
+ "patch": 0
43
+ }
44
+ }
data/frontend_data/all-mpnet-base-v2-embds/id_title_author/state.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_data_files": [
3
+ {
4
+ "filename": "data-00000-of-00001.arrow"
5
+ }
6
+ ],
7
+ "_fingerprint": "686bda94e0c233a4",
8
+ "_format_columns": null,
9
+ "_format_kwargs": {},
10
+ "_format_type": null,
11
+ "_output_all_columns": false,
12
+ "_split": "train"
13
+ }
data/frontend_data/all-mpnet-base-v2-embds/weights.pt.REMOVED.git-id ADDED
@@ -0,0 +1 @@
 
 
1
+ 7b1fa83dad18b346ea03429c5a54390006e9465f
data/frontend_data/us_professor.json ADDED
The diff for this file is too large to render. See raw diff
 
data_pipeline/conference_scraper.py CHANGED
@@ -26,7 +26,6 @@ ICLR, ICCV, ECCV, ACL, NAACL, and many others.
26
  -----------
27
  """
28
 
29
- from collections import defaultdict
30
  from functools import partial
31
  import json
32
  import os
 
26
  -----------
27
  """
28
 
 
29
  from functools import partial
30
  import json
31
  import os
data_pipeline/config.py CHANGED
@@ -25,7 +25,7 @@ class DataPaths:
25
  EMBD_PATH = os.path.join(PAPER_DIR, EMBD_MODEL)
26
  PAPER_DATA_PATH = os.path.join(PAPER_DIR, "paper_data")
27
 
28
- FRONTEND_DIR = os.path.join(BASE_DIR, 'frontend_data')
29
  FRONTEND_PROF_PATH = os.path.join(FRONTEND_DIR, 'us_professor.json')
30
  FRONTEND_EMBD_PATH = os.path.join(FRONTEND_DIR, EMBD_MODEL) # contains id, title, author, weights
31
  FRONTEND_ITA_PATH = os.path.join(FRONTEND_EMBD_PATH, 'id_title_author')
 
25
  EMBD_PATH = os.path.join(PAPER_DIR, EMBD_MODEL)
26
  PAPER_DATA_PATH = os.path.join(PAPER_DIR, "paper_data")
27
 
28
+ FRONTEND_DIR = os.path.join('frontend_data')
29
  FRONTEND_PROF_PATH = os.path.join(FRONTEND_DIR, 'us_professor.json')
30
  FRONTEND_EMBD_PATH = os.path.join(FRONTEND_DIR, EMBD_MODEL) # contains id, title, author, weights
31
  FRONTEND_ITA_PATH = os.path.join(FRONTEND_EMBD_PATH, 'id_title_author')
data_pipeline/requirements-data-pipeline.txt ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ beautifulsoup4
2
+ dotenv
3
+ datasets
4
+ kaggle
5
+ langchain
6
+ numpy
7
+ openai
8
+ pandas
9
+ regex
10
+ torch
11
+ tqdm
data_pipeline/requirements.txt DELETED
@@ -1,15 +0,0 @@
1
- beautifulsoup4==4.12.3
2
- datasets==3.0.1
3
- kaggle==1.6.17
4
- langchain==0.3.4
5
- langchain_core==0.3.12
6
- langchain_together==0.2.0
7
- numpy
8
- openai==1.52.0
9
- pandas
10
- python-dotenv==1.0.1
11
- regex==2024.9.11
12
- Requests==2.32.3
13
- torch
14
- tqdm==4.66.4
15
- transformers==4.45.2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
data_pipeline/us_professor_verifier.py CHANGED
@@ -12,7 +12,6 @@ from openai import OpenAI
12
  import regex as re
13
  from tqdm import tqdm
14
 
15
- from data_pipeline.conference_scraper import get_authors
16
  from data_pipeline.config import DataPaths
17
 
18
 
 
12
  import regex as re
13
  from tqdm import tqdm
14
 
 
15
  from data_pipeline.config import DataPaths
16
 
17
 
requirements.txt CHANGED
@@ -1 +1,4 @@
1
- streamlit
 
 
 
 
1
+ datasets
2
+ streamlit
3
+ torch
4
+ transformers