Spaces:
Build error
Build error
UI changes
Browse files- src/Surveyor.py +13 -19
src/Surveyor.py
CHANGED
|
@@ -244,11 +244,10 @@ class Surveyor:
|
|
| 244 |
|
| 245 |
papers = papers_meta[:self.num_papers]
|
| 246 |
selected_papers = papers
|
| 247 |
-
self.print_fn("\n-First stage paper collection...")
|
| 248 |
ids_none, papers, cites = self.fetch_papers(dump_dir, img_dir, papers, pdf_dir, tab_dir, txt_dir)
|
| 249 |
self.print_fn("\n-First stage paper collection complete, papers collected: \n" + ', '.join([p['id'] for p in papers]))
|
| 250 |
new_papers = papers_meta[self.num_papers : self.num_papers + len(ids_none)]
|
| 251 |
-
_ = self.get_freq_cited(cites)
|
| 252 |
'''
|
| 253 |
filtered_idlist = []
|
| 254 |
for c in self.get_freq_cited(cites):
|
|
@@ -257,7 +256,6 @@ class Surveyor:
|
|
| 257 |
new_papers.extend(new_searched_papers)
|
| 258 |
'''
|
| 259 |
selected_papers.extend(new_papers)
|
| 260 |
-
self.print_fn("\n-Second stage paper collection...")
|
| 261 |
_, new_papers, _ = self.fetch_papers(dump_dir, img_dir, new_papers, pdf_dir, tab_dir, txt_dir, repeat=True)
|
| 262 |
self.print_fn("\n-Second stage paper collection complete, new papers collected: \n" + ', '.join([p['id'] for p in new_papers]))
|
| 263 |
papers.extend(new_papers)
|
|
@@ -269,7 +267,7 @@ class Surveyor:
|
|
| 269 |
self.print_fn("\n-Extracting section-wise highlights.. ")
|
| 270 |
papers = self.extract_highlights(papers)
|
| 271 |
|
| 272 |
-
return papers, selected_papers
|
| 273 |
|
| 274 |
|
| 275 |
def get_freq_cited(self, cites_dict, k=5):
|
|
@@ -279,7 +277,6 @@ class Surveyor:
|
|
| 279 |
[cites_list.append(val) for val in v]
|
| 280 |
cite_freqs = {cite: cites_list.count(cite) for cite in set(cites_list)}
|
| 281 |
sorted_cites = dict(sorted(cite_freqs.items(), key=lambda item: item[1], reverse=True)[:5])
|
| 282 |
-
self.print_fn("\n-The most cited paper ids are:\n" + str(sorted_cites))
|
| 283 |
|
| 284 |
return sorted_cites.keys()
|
| 285 |
|
|
@@ -732,7 +729,7 @@ class Surveyor:
|
|
| 732 |
score = self.text_para_similarity(query, highlights)
|
| 733 |
scores.append(score)
|
| 734 |
pids.append(id)
|
| 735 |
-
self.print_fn("corpus item: " + str(self.get_by_pid(id, papers)['title']))
|
| 736 |
|
| 737 |
idx = np.argsort(scores)[:num_papers]
|
| 738 |
#for i in range(len(scores)):
|
|
@@ -747,12 +744,12 @@ class Surveyor:
|
|
| 747 |
for p in papers_selected:
|
| 748 |
self.print_fn("Selected Paper: " + p['title'])
|
| 749 |
|
| 750 |
-
self.print_fn("constrast with natural selection: forward")
|
| 751 |
-
for p in papers[:4]:
|
| 752 |
-
|
| 753 |
-
self.print_fn("constrast with natural selection: backward")
|
| 754 |
-
for p in papers[-4:]:
|
| 755 |
-
|
| 756 |
# arxiv search producing better relevance
|
| 757 |
return papers_selected
|
| 758 |
|
|
@@ -1205,8 +1202,6 @@ class Surveyor:
|
|
| 1205 |
import arxiv
|
| 1206 |
from urllib.parse import urlparse
|
| 1207 |
ids = [p['id'] for p in papers]
|
| 1208 |
-
self.print_fn("\n-downloading below selected papers: ")
|
| 1209 |
-
self.print_fn(ids)
|
| 1210 |
# asert(False)
|
| 1211 |
papers_filtered = arxiv.Search(id_list=ids).get()
|
| 1212 |
for p in papers_filtered:
|
|
@@ -1219,7 +1214,6 @@ class Surveyor:
|
|
| 1219 |
import arxiv
|
| 1220 |
from urllib.parse import urlparse
|
| 1221 |
ids = [p['id'] for p in papers]
|
| 1222 |
-
self.print_fn(ids)
|
| 1223 |
# asert(False)
|
| 1224 |
papers_filtered = arxiv.Search(id_list=ids).get()
|
| 1225 |
for p in papers_filtered:
|
|
@@ -1246,10 +1240,7 @@ class Surveyor:
|
|
| 1246 |
def cocitation_network(self, papers, txt_dir):
|
| 1247 |
import multiprocessing
|
| 1248 |
|
| 1249 |
-
|
| 1250 |
cites = internal_citations.citation_list_parallel(N=multiprocessing.cpu_count(), directory=txt_dir)
|
| 1251 |
-
self.print_fn("\n-citation-network: ")
|
| 1252 |
-
self.print_fn(cites)
|
| 1253 |
|
| 1254 |
for p in papers:
|
| 1255 |
p['cites'] = cites[p['id']]
|
|
@@ -1370,7 +1361,7 @@ class Surveyor:
|
|
| 1370 |
# paper selection by scibert vector embedding relevance scores
|
| 1371 |
# papers_selected = select_papers(searched_papers, query, num_papers=num_papers)
|
| 1372 |
|
| 1373 |
-
papers_highlighted, papers_selected = self.pdf_route(self.pdf_dir, self.txt_dir, self.img_dir, self.tab_dir, self.dump_dir,
|
| 1374 |
searched_papers)
|
| 1375 |
|
| 1376 |
if weigh_authors:
|
|
@@ -1478,6 +1469,9 @@ class Surveyor:
|
|
| 1478 |
survey_file = 'A_Survey_on_' + query.replace(' ', '_') + '.txt'
|
| 1479 |
self.build_doc(clustered_sections, papers_standardized, query=query, filename=self.dump_dir + survey_file)
|
| 1480 |
|
|
|
|
|
|
|
|
|
|
| 1481 |
shutil.copytree('arxiv_data/', self.dump_dir + '/arxiv_data/')
|
| 1482 |
shutil.copy(self.dump_dir + survey_file, survey_file)
|
| 1483 |
assert (os.path.exists(survey_file))
|
|
|
|
| 244 |
|
| 245 |
papers = papers_meta[:self.num_papers]
|
| 246 |
selected_papers = papers
|
|
|
|
| 247 |
ids_none, papers, cites = self.fetch_papers(dump_dir, img_dir, papers, pdf_dir, tab_dir, txt_dir)
|
| 248 |
self.print_fn("\n-First stage paper collection complete, papers collected: \n" + ', '.join([p['id'] for p in papers]))
|
| 249 |
new_papers = papers_meta[self.num_papers : self.num_papers + len(ids_none)]
|
| 250 |
+
# _ = self.get_freq_cited(cites)
|
| 251 |
'''
|
| 252 |
filtered_idlist = []
|
| 253 |
for c in self.get_freq_cited(cites):
|
|
|
|
| 256 |
new_papers.extend(new_searched_papers)
|
| 257 |
'''
|
| 258 |
selected_papers.extend(new_papers)
|
|
|
|
| 259 |
_, new_papers, _ = self.fetch_papers(dump_dir, img_dir, new_papers, pdf_dir, tab_dir, txt_dir, repeat=True)
|
| 260 |
self.print_fn("\n-Second stage paper collection complete, new papers collected: \n" + ', '.join([p['id'] for p in new_papers]))
|
| 261 |
papers.extend(new_papers)
|
|
|
|
| 267 |
self.print_fn("\n-Extracting section-wise highlights.. ")
|
| 268 |
papers = self.extract_highlights(papers)
|
| 269 |
|
| 270 |
+
return papers, selected_papers, cites
|
| 271 |
|
| 272 |
|
| 273 |
def get_freq_cited(self, cites_dict, k=5):
|
|
|
|
| 277 |
[cites_list.append(val) for val in v]
|
| 278 |
cite_freqs = {cite: cites_list.count(cite) for cite in set(cites_list)}
|
| 279 |
sorted_cites = dict(sorted(cite_freqs.items(), key=lambda item: item[1], reverse=True)[:5])
|
|
|
|
| 280 |
|
| 281 |
return sorted_cites.keys()
|
| 282 |
|
|
|
|
| 729 |
score = self.text_para_similarity(query, highlights)
|
| 730 |
scores.append(score)
|
| 731 |
pids.append(id)
|
| 732 |
+
# self.print_fn("corpus item: " + str(self.get_by_pid(id, papers)['title']))
|
| 733 |
|
| 734 |
idx = np.argsort(scores)[:num_papers]
|
| 735 |
#for i in range(len(scores)):
|
|
|
|
| 744 |
for p in papers_selected:
|
| 745 |
self.print_fn("Selected Paper: " + p['title'])
|
| 746 |
|
| 747 |
+
#self.print_fn("constrast with natural selection: forward")
|
| 748 |
+
#for p in papers[:4]:
|
| 749 |
+
# self.print_fn("Selected Paper: " + p['title'])
|
| 750 |
+
#self.print_fn("constrast with natural selection: backward")
|
| 751 |
+
#for p in papers[-4:]:
|
| 752 |
+
# self.print_fn("Selected Paper: " + p['title'])
|
| 753 |
# arxiv search producing better relevance
|
| 754 |
return papers_selected
|
| 755 |
|
|
|
|
| 1202 |
import arxiv
|
| 1203 |
from urllib.parse import urlparse
|
| 1204 |
ids = [p['id'] for p in papers]
|
|
|
|
|
|
|
| 1205 |
# asert(False)
|
| 1206 |
papers_filtered = arxiv.Search(id_list=ids).get()
|
| 1207 |
for p in papers_filtered:
|
|
|
|
| 1214 |
import arxiv
|
| 1215 |
from urllib.parse import urlparse
|
| 1216 |
ids = [p['id'] for p in papers]
|
|
|
|
| 1217 |
# asert(False)
|
| 1218 |
papers_filtered = arxiv.Search(id_list=ids).get()
|
| 1219 |
for p in papers_filtered:
|
|
|
|
| 1240 |
def cocitation_network(self, papers, txt_dir):
|
| 1241 |
import multiprocessing
|
| 1242 |
|
|
|
|
| 1243 |
cites = internal_citations.citation_list_parallel(N=multiprocessing.cpu_count(), directory=txt_dir)
|
|
|
|
|
|
|
| 1244 |
|
| 1245 |
for p in papers:
|
| 1246 |
p['cites'] = cites[p['id']]
|
|
|
|
| 1361 |
# paper selection by scibert vector embedding relevance scores
|
| 1362 |
# papers_selected = select_papers(searched_papers, query, num_papers=num_papers)
|
| 1363 |
|
| 1364 |
+
papers_highlighted, papers_selected, cites = self.pdf_route(self.pdf_dir, self.txt_dir, self.img_dir, self.tab_dir, self.dump_dir,
|
| 1365 |
searched_papers)
|
| 1366 |
|
| 1367 |
if weigh_authors:
|
|
|
|
| 1469 |
survey_file = 'A_Survey_on_' + query.replace(' ', '_') + '.txt'
|
| 1470 |
self.build_doc(clustered_sections, papers_standardized, query=query, filename=self.dump_dir + survey_file)
|
| 1471 |
|
| 1472 |
+
self.survey_print_fn("\n-citation-network: ")
|
| 1473 |
+
self.survey_print_fn(cites)
|
| 1474 |
+
|
| 1475 |
shutil.copytree('arxiv_data/', self.dump_dir + '/arxiv_data/')
|
| 1476 |
shutil.copy(self.dump_dir + survey_file, survey_file)
|
| 1477 |
assert (os.path.exists(survey_file))
|