Spaces:
Build error
Build error
fixes
Browse files
- src/Surveyor.py +20 -19
src/Surveyor.py
CHANGED
|
@@ -18,7 +18,7 @@ except:
|
|
| 18 |
|
| 19 |
from src.defaults import DEFAULTS_CPU_COMPAT, DEFAULTS_HIGH_GPU
|
| 20 |
|
| 21 |
-
|
| 22 |
|
| 23 |
class Surveyor:
|
| 24 |
'''
|
|
@@ -79,15 +79,16 @@ class Surveyor:
|
|
| 79 |
spacy.require_gpu()
|
| 80 |
|
| 81 |
self.high_gpu = high_gpu
|
|
|
|
| 82 |
if self.high_gpu:
|
| 83 |
-
DEFAULTS = DEFAULTS_HIGH_GPU
|
| 84 |
|
| 85 |
if not kw_model_name:
|
| 86 |
-
kw_model_name = DEFAULTS["kw_model_name"]
|
| 87 |
-
self.num_papers = DEFAULTS['num_papers']
|
| 88 |
-
self.max_search = DEFAULTS['max_search']
|
| 89 |
if not models_dir:
|
| 90 |
-
models_dir = DEFAULTS['models_dir']
|
| 91 |
|
| 92 |
models_found = False
|
| 93 |
if os.path.exists(models_dir) and not no_save_models:
|
|
@@ -95,17 +96,17 @@ class Surveyor:
|
|
| 95 |
models_found = True
|
| 96 |
|
| 97 |
if not title_model_name:
|
| 98 |
-
title_model_name = DEFAULTS["title_model_name"]
|
| 99 |
if not ex_summ_model_name:
|
| 100 |
-
ex_summ_model_name = DEFAULTS["ex_summ_model_name"]
|
| 101 |
if not ledmodel_name:
|
| 102 |
-
ledmodel_name = DEFAULTS["ledmodel_name"]
|
| 103 |
if not embedder_name:
|
| 104 |
-
embedder_name = DEFAULTS["embedder_name"]
|
| 105 |
if not nlp_name:
|
| 106 |
-
nlp_name = DEFAULTS["nlp_name"]
|
| 107 |
if not similarity_nlp_name:
|
| 108 |
-
similarity_nlp_name = DEFAULTS["similarity_nlp_name"]
|
| 109 |
|
| 110 |
if refresh_models or not models_found:
|
| 111 |
print(f'\nInitializing models {"and saving (about 5GB)" if not no_save_models else ""}')
|
|
@@ -183,27 +184,27 @@ class Surveyor:
|
|
| 183 |
if pdf_dir:
|
| 184 |
self.pdf_dir = pdf_dir
|
| 185 |
else:
|
| 186 |
-
self.pdf_dir = DEFAULTS["pdf_dir"]
|
| 187 |
|
| 188 |
if txt_dir:
|
| 189 |
self.txt_dir = txt_dir
|
| 190 |
else:
|
| 191 |
-
self.txt_dir = DEFAULTS["txt_dir"]
|
| 192 |
|
| 193 |
if img_dir:
|
| 194 |
self.img_dir = img_dir
|
| 195 |
else:
|
| 196 |
-
self.img_dir = DEFAULTS["img_dir"]
|
| 197 |
|
| 198 |
if tab_dir:
|
| 199 |
self.tab_dir = tab_dir
|
| 200 |
else:
|
| 201 |
-
self.tab_dir = DEFAULTS["tab_dir"]
|
| 202 |
|
| 203 |
if dump_dir:
|
| 204 |
self.dump_dir = dump_dir
|
| 205 |
else:
|
| 206 |
-
self.dump_dir = DEFAULTS["dump_dir"]
|
| 207 |
|
| 208 |
dirs = [self.pdf_dir, self.txt_dir, self.img_dir, self.tab_dir, self.dump_dir]
|
| 209 |
if sum([True for dir in dirs if 'arxiv_data/' in dir]):
|
|
@@ -1337,9 +1338,9 @@ class Surveyor:
|
|
| 1337 |
import joblib
|
| 1338 |
import os, shutil
|
| 1339 |
if not max_search:
|
| 1340 |
-
max_search = DEFAULTS['max_search']
|
| 1341 |
if not num_papers:
|
| 1342 |
-
num_papers = DEFAULTS['num_papers']
|
| 1343 |
# arxiv api relevance search and data preparation
|
| 1344 |
print("\nsearching arXiv for top 100 papers.. ")
|
| 1345 |
results, searched_papers = self.search(query, max_search=max_search)
|
|
|
|
| 18 |
|
| 19 |
from src.defaults import DEFAULTS_CPU_COMPAT, DEFAULTS_HIGH_GPU
|
| 20 |
|
| 21 |
+
|
| 22 |
|
| 23 |
class Surveyor:
|
| 24 |
'''
|
|
|
|
| 79 |
spacy.require_gpu()
|
| 80 |
|
| 81 |
self.high_gpu = high_gpu
|
| 82 |
+
self.DEFAULTS = DEFAULTS_CPU_COMPAT
|
| 83 |
if self.high_gpu:
|
| 84 |
+
self.DEFAULTS = DEFAULTS_HIGH_GPU
|
| 85 |
|
| 86 |
if not kw_model_name:
|
| 87 |
+
kw_model_name = self.DEFAULTS["kw_model_name"]
|
| 88 |
+
self.num_papers = self.DEFAULTS['num_papers']
|
| 89 |
+
self.max_search = self.DEFAULTS['max_search']
|
| 90 |
if not models_dir:
|
| 91 |
+
models_dir = self.DEFAULTS['models_dir']
|
| 92 |
|
| 93 |
models_found = False
|
| 94 |
if os.path.exists(models_dir) and not no_save_models:
|
|
|
|
| 96 |
models_found = True
|
| 97 |
|
| 98 |
if not title_model_name:
|
| 99 |
+
title_model_name = self.DEFAULTS["title_model_name"]
|
| 100 |
if not ex_summ_model_name:
|
| 101 |
+
ex_summ_model_name = self.DEFAULTS["ex_summ_model_name"]
|
| 102 |
if not ledmodel_name:
|
| 103 |
+
ledmodel_name = self.DEFAULTS["ledmodel_name"]
|
| 104 |
if not embedder_name:
|
| 105 |
+
embedder_name = self.DEFAULTS["embedder_name"]
|
| 106 |
if not nlp_name:
|
| 107 |
+
nlp_name = self.DEFAULTS["nlp_name"]
|
| 108 |
if not similarity_nlp_name:
|
| 109 |
+
similarity_nlp_name = self.DEFAULTS["similarity_nlp_name"]
|
| 110 |
|
| 111 |
if refresh_models or not models_found:
|
| 112 |
print(f'\nInitializing models {"and saving (about 5GB)" if not no_save_models else ""}')
|
|
|
|
| 184 |
if pdf_dir:
|
| 185 |
self.pdf_dir = pdf_dir
|
| 186 |
else:
|
| 187 |
+
self.pdf_dir = self.DEFAULTS["pdf_dir"]
|
| 188 |
|
| 189 |
if txt_dir:
|
| 190 |
self.txt_dir = txt_dir
|
| 191 |
else:
|
| 192 |
+
self.txt_dir = self.DEFAULTS["txt_dir"]
|
| 193 |
|
| 194 |
if img_dir:
|
| 195 |
self.img_dir = img_dir
|
| 196 |
else:
|
| 197 |
+
self.img_dir = self.DEFAULTS["img_dir"]
|
| 198 |
|
| 199 |
if tab_dir:
|
| 200 |
self.tab_dir = tab_dir
|
| 201 |
else:
|
| 202 |
+
self.tab_dir = self.DEFAULTS["tab_dir"]
|
| 203 |
|
| 204 |
if dump_dir:
|
| 205 |
self.dump_dir = dump_dir
|
| 206 |
else:
|
| 207 |
+
self.dump_dir = self.DEFAULTS["dump_dir"]
|
| 208 |
|
| 209 |
dirs = [self.pdf_dir, self.txt_dir, self.img_dir, self.tab_dir, self.dump_dir]
|
| 210 |
if sum([True for dir in dirs if 'arxiv_data/' in dir]):
|
|
|
|
| 1338 |
import joblib
|
| 1339 |
import os, shutil
|
| 1340 |
if not max_search:
|
| 1341 |
+
max_search = self.DEFAULTS['max_search']
|
| 1342 |
if not num_papers:
|
| 1343 |
+
num_papers = self.DEFAULTS['num_papers']
|
| 1344 |
# arxiv api relevance search and data preparation
|
| 1345 |
print("\nsearching arXiv for top 100 papers.. ")
|
| 1346 |
results, searched_papers = self.search(query, max_search=max_search)
|