Spaces:
Sleeping
Sleeping
update requirements.txt, add frontend_data
Browse files
Former-commit-id: 3db30844c62bbf374f4bbeba88e8f58197a33b42
- .gitattributes +1 -0
- .gitignore +1 -0
- data/frontend_data/all-mpnet-base-v2-embds/id_title_author/dataset_info.json +44 -0
- data/frontend_data/all-mpnet-base-v2-embds/id_title_author/state.json +13 -0
- data/frontend_data/all-mpnet-base-v2-embds/weights.pt.REMOVED.git-id +1 -0
- data/frontend_data/us_professor.json +0 -0
- data_pipeline/conference_scraper.py +0 -1
- data_pipeline/config.py +1 -1
- data_pipeline/requirements-data-pipeline.txt +11 -0
- data_pipeline/requirements.txt +0 -15
- data_pipeline/us_professor_verifier.py +0 -1
- requirements.txt +4 -1
.gitattributes
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
data/frontend_data/ filter=lfs diff=lfs merge=lfs -text
|
.gitignore
CHANGED
|
@@ -1,5 +1,6 @@
|
|
| 1 |
.env
|
| 2 |
data/*
|
|
|
|
| 3 |
runs/*
|
| 4 |
logs/*
|
| 5 |
nbs/*
|
|
|
|
| 1 |
.env
|
| 2 |
data/*
|
| 3 |
+
!data/frontend_data/
|
| 4 |
runs/*
|
| 5 |
logs/*
|
| 6 |
nbs/*
|
data/frontend_data/all-mpnet-base-v2-embds/id_title_author/dataset_info.json
ADDED
|
@@ -0,0 +1,44 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"builder_name": "generator",
|
| 3 |
+
"citation": "",
|
| 4 |
+
"config_name": "default",
|
| 5 |
+
"dataset_name": "generator",
|
| 6 |
+
"dataset_size": 18496049,
|
| 7 |
+
"description": "",
|
| 8 |
+
"download_checksums": {},
|
| 9 |
+
"download_size": 0,
|
| 10 |
+
"features": {
|
| 11 |
+
"id": {
|
| 12 |
+
"dtype": "string",
|
| 13 |
+
"_type": "Value"
|
| 14 |
+
},
|
| 15 |
+
"title": {
|
| 16 |
+
"dtype": "string",
|
| 17 |
+
"_type": "Value"
|
| 18 |
+
},
|
| 19 |
+
"authors": {
|
| 20 |
+
"feature": {
|
| 21 |
+
"dtype": "string",
|
| 22 |
+
"_type": "Value"
|
| 23 |
+
},
|
| 24 |
+
"_type": "Sequence"
|
| 25 |
+
}
|
| 26 |
+
},
|
| 27 |
+
"homepage": "",
|
| 28 |
+
"license": "",
|
| 29 |
+
"size_in_bytes": 18496049,
|
| 30 |
+
"splits": {
|
| 31 |
+
"train": {
|
| 32 |
+
"name": "train",
|
| 33 |
+
"num_bytes": 18496049,
|
| 34 |
+
"num_examples": 12867,
|
| 35 |
+
"dataset_name": "generator"
|
| 36 |
+
}
|
| 37 |
+
},
|
| 38 |
+
"version": {
|
| 39 |
+
"version_str": "0.0.0",
|
| 40 |
+
"major": 0,
|
| 41 |
+
"minor": 0,
|
| 42 |
+
"patch": 0
|
| 43 |
+
}
|
| 44 |
+
}
|
data/frontend_data/all-mpnet-base-v2-embds/id_title_author/state.json
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"_data_files": [
|
| 3 |
+
{
|
| 4 |
+
"filename": "data-00000-of-00001.arrow"
|
| 5 |
+
}
|
| 6 |
+
],
|
| 7 |
+
"_fingerprint": "686bda94e0c233a4",
|
| 8 |
+
"_format_columns": null,
|
| 9 |
+
"_format_kwargs": {},
|
| 10 |
+
"_format_type": null,
|
| 11 |
+
"_output_all_columns": false,
|
| 12 |
+
"_split": "train"
|
| 13 |
+
}
|
data/frontend_data/all-mpnet-base-v2-embds/weights.pt.REMOVED.git-id
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
7b1fa83dad18b346ea03429c5a54390006e9465f
|
data/frontend_data/us_professor.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
data_pipeline/conference_scraper.py
CHANGED
|
@@ -26,7 +26,6 @@ ICLR, ICCV, ECCV, ACL, NAACL, and many others.
|
|
| 26 |
-----------
|
| 27 |
"""
|
| 28 |
|
| 29 |
-
from collections import defaultdict
|
| 30 |
from functools import partial
|
| 31 |
import json
|
| 32 |
import os
|
|
|
|
| 26 |
-----------
|
| 27 |
"""
|
| 28 |
|
|
|
|
| 29 |
from functools import partial
|
| 30 |
import json
|
| 31 |
import os
|
data_pipeline/config.py
CHANGED
|
@@ -25,7 +25,7 @@ class DataPaths:
|
|
| 25 |
EMBD_PATH = os.path.join(PAPER_DIR, EMBD_MODEL)
|
| 26 |
PAPER_DATA_PATH = os.path.join(PAPER_DIR, "paper_data")
|
| 27 |
|
| 28 |
-
FRONTEND_DIR = os.path.join(
|
| 29 |
FRONTEND_PROF_PATH = os.path.join(FRONTEND_DIR, 'us_professor.json')
|
| 30 |
FRONTEND_EMBD_PATH = os.path.join(FRONTEND_DIR, EMBD_MODEL) # contains id, title, author, weights
|
| 31 |
FRONTEND_ITA_PATH = os.path.join(FRONTEND_EMBD_PATH, 'id_title_author')
|
|
|
|
| 25 |
EMBD_PATH = os.path.join(PAPER_DIR, EMBD_MODEL)
|
| 26 |
PAPER_DATA_PATH = os.path.join(PAPER_DIR, "paper_data")
|
| 27 |
|
| 28 |
+
FRONTEND_DIR = os.path.join('frontend_data')
|
| 29 |
FRONTEND_PROF_PATH = os.path.join(FRONTEND_DIR, 'us_professor.json')
|
| 30 |
FRONTEND_EMBD_PATH = os.path.join(FRONTEND_DIR, EMBD_MODEL) # contains id, title, author, weights
|
| 31 |
FRONTEND_ITA_PATH = os.path.join(FRONTEND_EMBD_PATH, 'id_title_author')
|
data_pipeline/requirements-data-pipeline.txt
ADDED
|
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
beautifulsoup4
|
| 2 |
+
dotenv
|
| 3 |
+
datasets
|
| 4 |
+
kaggle
|
| 5 |
+
langchain
|
| 6 |
+
numpy
|
| 7 |
+
openai
|
| 8 |
+
pandas
|
| 9 |
+
regex
|
| 10 |
+
torch
|
| 11 |
+
tqdm
|
data_pipeline/requirements.txt
DELETED
|
@@ -1,15 +0,0 @@
|
|
| 1 |
-
beautifulsoup4==4.12.3
|
| 2 |
-
datasets==3.0.1
|
| 3 |
-
kaggle==1.6.17
|
| 4 |
-
langchain==0.3.4
|
| 5 |
-
langchain_core==0.3.12
|
| 6 |
-
langchain_together==0.2.0
|
| 7 |
-
numpy
|
| 8 |
-
openai==1.52.0
|
| 9 |
-
pandas
|
| 10 |
-
python-dotenv==1.0.1
|
| 11 |
-
regex==2024.9.11
|
| 12 |
-
Requests==2.32.3
|
| 13 |
-
torch
|
| 14 |
-
tqdm==4.66.4
|
| 15 |
-
transformers==4.45.2
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
data_pipeline/us_professor_verifier.py
CHANGED
|
@@ -12,7 +12,6 @@ from openai import OpenAI
|
|
| 12 |
import regex as re
|
| 13 |
from tqdm import tqdm
|
| 14 |
|
| 15 |
-
from data_pipeline.conference_scraper import get_authors
|
| 16 |
from data_pipeline.config import DataPaths
|
| 17 |
|
| 18 |
|
|
|
|
| 12 |
import regex as re
|
| 13 |
from tqdm import tqdm
|
| 14 |
|
|
|
|
| 15 |
from data_pipeline.config import DataPaths
|
| 16 |
|
| 17 |
|
requirements.txt
CHANGED
|
@@ -1 +1,4 @@
|
|
| 1 |
-
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
datasets
|
| 2 |
+
streamlit
|
| 3 |
+
torch
|
| 4 |
+
transformers
|