Spaces:
Build error
Build error
print fix
Browse files
- app.py +1 -1
- src/Surveyor.py +30 -31
app.py
CHANGED
|
@@ -28,7 +28,6 @@ def run_survey(surveyor, research_keywords, max_search, num_papers):
|
|
| 28 |
|
| 29 |
|
| 30 |
def survey_space(surveyor):
|
| 31 |
-
st.container().title('Auto-Research V0.1 - Automated Survey generation from research keywords')
|
| 32 |
form = st.sidebar.form(key='survey_form')
|
| 33 |
research_keywords = form.text_input("What would you like to research in today?")
|
| 34 |
max_search = form.number_input("num_papers_to_search", help="maximium number of papers to glance through - defaults to 20",
|
|
@@ -42,6 +41,7 @@ def survey_space(surveyor):
|
|
| 42 |
|
| 43 |
|
| 44 |
if __name__ == '__main__':
|
|
|
|
| 45 |
global surveyor
|
| 46 |
surveyor_obj = Surveyor(print_fn=st.write)
|
| 47 |
survey_space(surveyor_obj)
|
|
|
|
| 28 |
|
| 29 |
|
| 30 |
def survey_space(surveyor):
|
|
|
|
| 31 |
form = st.sidebar.form(key='survey_form')
|
| 32 |
research_keywords = form.text_input("What would you like to research in today?")
|
| 33 |
max_search = form.number_input("num_papers_to_search", help="maximium number of papers to glance through - defaults to 20",
|
|
|
|
| 41 |
|
| 42 |
|
| 43 |
if __name__ == '__main__':
|
| 44 |
+
st.container().title('Auto-Research V0.1 - Automated Survey generation from research keywords')
|
| 45 |
global surveyor
|
| 46 |
surveyor_obj = Surveyor(print_fn=st.write)
|
| 47 |
survey_space(surveyor_obj)
|
src/Surveyor.py
CHANGED
|
@@ -75,11 +75,10 @@ class Surveyor:
|
|
| 75 |
self.print_fn = print
|
| 76 |
if print_fn is not None:
|
| 77 |
self.print_fn = print_fn
|
| 78 |
-
|
| 79 |
self.torch_device = 'cpu'
|
| 80 |
self.print_fn("\nTorch_device: " + self.torch_device)
|
| 81 |
if torch.cuda.is_available():
|
| 82 |
-
self.print_fn("\nloading defaults for gpu")
|
| 83 |
self.torch_device = 'cuda'
|
| 84 |
spacy.require_gpu()
|
| 85 |
|
|
@@ -153,7 +152,7 @@ class Surveyor:
|
|
| 153 |
if not no_save_models:
|
| 154 |
self.embedder.save(models_dir + "/embedder")
|
| 155 |
else:
|
| 156 |
-
self.print_fn("\
|
| 157 |
self.title_tokenizer = AutoTokenizer.from_pretrained(title_model_name)
|
| 158 |
self.title_model = AutoModelForSeq2SeqLM.from_pretrained(models_dir + "/title_model").to(self.torch_device)
|
| 159 |
self.title_model.eval()
|
|
@@ -235,9 +234,9 @@ class Surveyor:
|
|
| 235 |
|
| 236 |
papers = papers_meta[:self.num_papers]
|
| 237 |
selected_papers = papers
|
| 238 |
-
self.print_fn("\
|
| 239 |
ids_none, papers, cites = self.fetch_papers(dump_dir, img_dir, papers, pdf_dir, tab_dir, txt_dir)
|
| 240 |
-
self.print_fn("\
|
| 241 |
new_papers = papers_meta[self.num_papers : self.num_papers + len(ids_none)]
|
| 242 |
_ = self.get_freq_cited(cites)
|
| 243 |
'''
|
|
@@ -248,16 +247,16 @@ class Surveyor:
|
|
| 248 |
new_papers.extend(new_searched_papers)
|
| 249 |
'''
|
| 250 |
selected_papers.extend(new_papers)
|
| 251 |
-
self.print_fn("\
|
| 252 |
_, new_papers, _ = self.fetch_papers(dump_dir, img_dir, new_papers, pdf_dir, tab_dir, txt_dir, repeat=True)
|
| 253 |
-
self.print_fn("\
|
| 254 |
papers.extend(new_papers)
|
| 255 |
|
| 256 |
joblib.dump(papers, dump_dir + 'papers_extracted_pdf_route.dmp')
|
| 257 |
copy_tree(img_dir, dump_dir + os.path.basename(img_dir))
|
| 258 |
copy_tree(tab_dir, dump_dir + os.path.basename(tab_dir))
|
| 259 |
|
| 260 |
-
self.print_fn("\
|
| 261 |
papers = self.extract_highlights(papers)
|
| 262 |
|
| 263 |
return papers, selected_papers
|
|
@@ -270,7 +269,7 @@ class Surveyor:
|
|
| 270 |
[cites_list.append(val) for val in v]
|
| 271 |
cite_freqs = {cite: cites_list.count(cite) for cite in set(cites_list)}
|
| 272 |
sorted_cites = dict(sorted(cite_freqs.items(), key=lambda item: item[1], reverse=True)[:5])
|
| 273 |
-
self.print_fn("\
|
| 274 |
|
| 275 |
return sorted_cites.keys()
|
| 276 |
|
|
@@ -333,11 +332,11 @@ class Surveyor:
|
|
| 333 |
def build_doc(self, research_sections, papers, query=None, filename='survey.txt'):
|
| 334 |
|
| 335 |
import arxiv2bib
|
| 336 |
-
self.print_fn("\
|
| 337 |
bibentries = arxiv2bib.arxiv2bib([p['id'] for p in papers])
|
| 338 |
bibentries = [r.bibtex() for r in bibentries]
|
| 339 |
|
| 340 |
-
self.print_fn("\
|
| 341 |
file = open(filename, 'w+')
|
| 342 |
if query is None:
|
| 343 |
query = 'Internal(existing) research'
|
|
@@ -768,7 +767,7 @@ class Surveyor:
|
|
| 768 |
res = self.model(" ".join([l.lower() for l in lines]), ratio=0.5, )
|
| 769 |
res_doc = self.nlp(res)
|
| 770 |
res_lines = set([str(sent) for sent in list(res_doc.sents)])
|
| 771 |
-
# self.print_fn("\n".join(res_sents))
|
| 772 |
with torch.no_grad():
|
| 773 |
keywords = self.kw_model.extract_keywords(str(" ".join([l.lower() for l in lines])), stop_words='english')
|
| 774 |
keyphrases = self.kw_model.extract_keywords(str(" ".join([l.lower() for l in lines])),
|
|
@@ -794,14 +793,14 @@ class Surveyor:
|
|
| 794 |
return papers
|
| 795 |
|
| 796 |
def extract_structure(self, papers, pdf_dir, txt_dir, img_dir, dump_dir, tab_dir, tables=False):
|
| 797 |
-
self.print_fn("\
|
| 798 |
papers, ids_none = self.extract_parts(papers, txt_dir, dump_dir)
|
| 799 |
|
| 800 |
-
self.print_fn("\
|
| 801 |
papers = self.extract_images(papers, pdf_dir, img_dir)
|
| 802 |
|
| 803 |
if tables:
|
| 804 |
-
self.print_fn("\
|
| 805 |
papers = self.extract_tables(papers, pdf_dir, tab_dir)
|
| 806 |
|
| 807 |
return papers, ids_none
|
|
@@ -1057,7 +1056,7 @@ class Surveyor:
|
|
| 1057 |
for p in papers:
|
| 1058 |
if p['id'] == pid:
|
| 1059 |
return p
|
| 1060 |
-
self.print_fn("\
|
| 1061 |
|
| 1062 |
|
| 1063 |
def alpha_length(self, s):
|
|
@@ -1191,7 +1190,7 @@ class Surveyor:
|
|
| 1191 |
else:
|
| 1192 |
discarded_ids.append(urlparse(result.entry_id).path.split('/')[-1].split('v')[0])
|
| 1193 |
|
| 1194 |
-
self.print_fn("\
|
| 1195 |
|
| 1196 |
return results, searched_papers
|
| 1197 |
|
|
@@ -1199,7 +1198,7 @@ class Surveyor:
|
|
| 1199 |
import arxiv
|
| 1200 |
from urllib.parse import urlparse
|
| 1201 |
ids = [p['id'] for p in papers]
|
| 1202 |
-
self.print_fn("\
|
| 1203 |
self.print_fn(ids)
|
| 1204 |
# asert(False)
|
| 1205 |
papers_filtered = arxiv.Search(id_list=ids).get()
|
|
@@ -1242,7 +1241,7 @@ class Surveyor:
|
|
| 1242 |
|
| 1243 |
|
| 1244 |
cites = internal_citations.citation_list_parallel(N=multiprocessing.cpu_count(), directory=txt_dir)
|
| 1245 |
-
self.print_fn("\
|
| 1246 |
self.print_fn(cites)
|
| 1247 |
|
| 1248 |
for p in papers:
|
|
@@ -1354,10 +1353,10 @@ class Surveyor:
|
|
| 1354 |
if not num_papers:
|
| 1355 |
num_papers = self.DEFAULTS['num_papers']
|
| 1356 |
# arxiv api relevance search and data preparation
|
| 1357 |
-
self.print_fn("\
|
| 1358 |
results, searched_papers = self.search(query, max_search=max_search)
|
| 1359 |
joblib.dump(searched_papers, self.dump_dir + 'papers_metadata.dmp')
|
| 1360 |
-
self.print_fn("\
|
| 1361 |
|
| 1362 |
# paper selection by scibert vector embedding relevance scores
|
| 1363 |
# papers_selected = select_papers(searched_papers, query, num_papers=num_papers)
|
|
@@ -1370,23 +1369,23 @@ class Surveyor:
|
|
| 1370 |
|
| 1371 |
joblib.dump(papers_highlighted, self.dump_dir + 'papers_highlighted.dmp')
|
| 1372 |
|
| 1373 |
-
self.print_fn("\
|
| 1374 |
papers_standardized = self.standardize_headings(papers_highlighted)
|
| 1375 |
joblib.dump(papers_standardized, self.dump_dir + 'papers_standardized.dmp')
|
| 1376 |
|
| 1377 |
-
self.print_fn("\
|
| 1378 |
corpus = self.build_corpus(papers_highlighted, searched_papers)
|
| 1379 |
joblib.dump(corpus, self.dump_dir + 'corpus.dmp')
|
| 1380 |
|
| 1381 |
-
self.print_fn("\
|
| 1382 |
corpus_sectionwise = self.build_corpus_sectionwise(papers_standardized)
|
| 1383 |
joblib.dump(corpus_sectionwise, self.dump_dir + 'corpus_sectionwise.dmp')
|
| 1384 |
|
| 1385 |
-
self.print_fn("\
|
| 1386 |
research_blocks = self.build_basic_blocks(corpus_sectionwise, corpus)
|
| 1387 |
joblib.dump(research_blocks, self.dump_dir + 'research_blocks.dmp')
|
| 1388 |
|
| 1389 |
-
self.print_fn("\
|
| 1390 |
corpus_lines = self.get_corpus_lines(corpus)
|
| 1391 |
joblib.dump(corpus_lines, self.dump_dir + 'corpus_lines.dmp')
|
| 1392 |
|
|
@@ -1420,7 +1419,7 @@ class Surveyor:
|
|
| 1420 |
'''
|
| 1421 |
# self.print_fn("corpus types:"+ str(np.unique([type(txt) for k,txt in corpus.items()])))
|
| 1422 |
|
| 1423 |
-
self.print_fn("\
|
| 1424 |
abstract_block = self.get_abstract(corpus_lines, corpus_sectionwise, research_blocks)
|
| 1425 |
joblib.dump(abstract_block, self.dump_dir + 'abstract_block.dmp')
|
| 1426 |
'''
|
|
@@ -1429,7 +1428,7 @@ class Surveyor:
|
|
| 1429 |
self.print_fn(abstract_block)
|
| 1430 |
'''
|
| 1431 |
|
| 1432 |
-
self.print_fn("\
|
| 1433 |
intro_block = self.get_intro(corpus_sectionwise, research_blocks)
|
| 1434 |
joblib.dump(intro_block, self.dump_dir + 'intro_block.dmp')
|
| 1435 |
'''
|
|
@@ -1437,7 +1436,7 @@ class Surveyor:
|
|
| 1437 |
self.print_fn("intro_block:")
|
| 1438 |
self.print_fn(intro_block)
|
| 1439 |
'''
|
| 1440 |
-
self.print_fn("\
|
| 1441 |
clustered_sections, clustered_sentences = self.get_clusters(papers_standardized, searched_papers)
|
| 1442 |
joblib.dump(clustered_sections, self.dump_dir + 'clustered_sections.dmp')
|
| 1443 |
joblib.dump(clustered_sentences, self.dump_dir + 'clustered_sentences.dmp')
|
|
@@ -1455,7 +1454,7 @@ class Surveyor:
|
|
| 1455 |
clustered_sections['introduction'] = intro_block
|
| 1456 |
joblib.dump(clustered_sections, self.dump_dir + 'research_sections.dmp')
|
| 1457 |
|
| 1458 |
-
self.print_fn("\
|
| 1459 |
conclusion_block = self.get_conclusion(clustered_sections)
|
| 1460 |
joblib.dump(conclusion_block, self.dump_dir + 'conclusion_block.dmp')
|
| 1461 |
clustered_sections['conclusion'] = conclusion_block
|
|
@@ -1472,7 +1471,7 @@ class Surveyor:
|
|
| 1472 |
shutil.copy(self.dump_dir + survey_file, survey_file)
|
| 1473 |
assert (os.path.exists(survey_file))
|
| 1474 |
output_zip = self.zip_outputs(self.dump_dir, query)
|
| 1475 |
-
self.print_fn("\
|
| 1476 |
survey_file) + "\nAll outputs zip path :" + os.path.abspath(self.dump_dir + output_zip))
|
| 1477 |
|
| 1478 |
return os.path.abspath(self.dump_dir + output_zip), os.path.abspath(survey_file)
|
|
|
|
| 75 |
self.print_fn = print
|
| 76 |
if print_fn is not None:
|
| 77 |
self.print_fn = print_fn
|
| 78 |
+
|
| 79 |
self.torch_device = 'cpu'
|
| 80 |
self.print_fn("\nTorch_device: " + self.torch_device)
|
| 81 |
if torch.cuda.is_available():
|
|
|
|
| 82 |
self.torch_device = 'cuda'
|
| 83 |
spacy.require_gpu()
|
| 84 |
|
|
|
|
| 152 |
if not no_save_models:
|
| 153 |
self.embedder.save(models_dir + "/embedder")
|
| 154 |
else:
|
| 155 |
+
self.print_fn("\n-Initializing from previously saved models at" + models_dir)
|
| 156 |
self.title_tokenizer = AutoTokenizer.from_pretrained(title_model_name)
|
| 157 |
self.title_model = AutoModelForSeq2SeqLM.from_pretrained(models_dir + "/title_model").to(self.torch_device)
|
| 158 |
self.title_model.eval()
|
|
|
|
| 234 |
|
| 235 |
papers = papers_meta[:self.num_papers]
|
| 236 |
selected_papers = papers
|
| 237 |
+
self.print_fn("\n-First stage paper collection...")
|
| 238 |
ids_none, papers, cites = self.fetch_papers(dump_dir, img_dir, papers, pdf_dir, tab_dir, txt_dir)
|
| 239 |
+
self.print_fn("\n-First stage paper collection complete, papers collected: \n" + ', '.join([p['id'] for p in papers]))
|
| 240 |
new_papers = papers_meta[self.num_papers : self.num_papers + len(ids_none)]
|
| 241 |
_ = self.get_freq_cited(cites)
|
| 242 |
'''
|
|
|
|
| 247 |
new_papers.extend(new_searched_papers)
|
| 248 |
'''
|
| 249 |
selected_papers.extend(new_papers)
|
| 250 |
+
self.print_fn("\n-Second stage paper collection...")
|
| 251 |
_, new_papers, _ = self.fetch_papers(dump_dir, img_dir, new_papers, pdf_dir, tab_dir, txt_dir, repeat=True)
|
| 252 |
+
self.print_fn("\n-Second stage paper collection complete, new papers collected: \n" + ', '.join([p['id'] for p in new_papers]))
|
| 253 |
papers.extend(new_papers)
|
| 254 |
|
| 255 |
joblib.dump(papers, dump_dir + 'papers_extracted_pdf_route.dmp')
|
| 256 |
copy_tree(img_dir, dump_dir + os.path.basename(img_dir))
|
| 257 |
copy_tree(tab_dir, dump_dir + os.path.basename(tab_dir))
|
| 258 |
|
| 259 |
+
self.print_fn("\n-Extracting section-wise highlights.. ")
|
| 260 |
papers = self.extract_highlights(papers)
|
| 261 |
|
| 262 |
return papers, selected_papers
|
|
|
|
| 269 |
[cites_list.append(val) for val in v]
|
| 270 |
cite_freqs = {cite: cites_list.count(cite) for cite in set(cites_list)}
|
| 271 |
sorted_cites = dict(sorted(cite_freqs.items(), key=lambda item: item[1], reverse=True)[:5])
|
| 272 |
+
self.print_fn("\n-The most cited paper ids are:\n" + str(sorted_cites))
|
| 273 |
|
| 274 |
return sorted_cites.keys()
|
| 275 |
|
|
|
|
| 332 |
def build_doc(self, research_sections, papers, query=None, filename='survey.txt'):
|
| 333 |
|
| 334 |
import arxiv2bib
|
| 335 |
+
self.print_fn("\n-building bibliography entries.. ")
|
| 336 |
bibentries = arxiv2bib.arxiv2bib([p['id'] for p in papers])
|
| 337 |
bibentries = [r.bibtex() for r in bibentries]
|
| 338 |
|
| 339 |
+
self.print_fn("\n-building final survey file .. at "+ filename)
|
| 340 |
file = open(filename, 'w+')
|
| 341 |
if query is None:
|
| 342 |
query = 'Internal(existing) research'
|
|
|
|
| 767 |
res = self.model(" ".join([l.lower() for l in lines]), ratio=0.5, )
|
| 768 |
res_doc = self.nlp(res)
|
| 769 |
res_lines = set([str(sent) for sent in list(res_doc.sents)])
|
| 770 |
+
# self.print_fn("\n-".join(res_sents))
|
| 771 |
with torch.no_grad():
|
| 772 |
keywords = self.kw_model.extract_keywords(str(" ".join([l.lower() for l in lines])), stop_words='english')
|
| 773 |
keyphrases = self.kw_model.extract_keywords(str(" ".join([l.lower() for l in lines])),
|
|
|
|
| 793 |
return papers
|
| 794 |
|
| 795 |
def extract_structure(self, papers, pdf_dir, txt_dir, img_dir, dump_dir, tab_dir, tables=False):
|
| 796 |
+
self.print_fn("\n-extracting sections.. ")
|
| 797 |
papers, ids_none = self.extract_parts(papers, txt_dir, dump_dir)
|
| 798 |
|
| 799 |
+
self.print_fn("\n-extracting images.. for future correlation use-cases ")
|
| 800 |
papers = self.extract_images(papers, pdf_dir, img_dir)
|
| 801 |
|
| 802 |
if tables:
|
| 803 |
+
self.print_fn("\n-extracting tables.. for future correlation use-cases ")
|
| 804 |
papers = self.extract_tables(papers, pdf_dir, tab_dir)
|
| 805 |
|
| 806 |
return papers, ids_none
|
|
|
|
| 1056 |
for p in papers:
|
| 1057 |
if p['id'] == pid:
|
| 1058 |
return p
|
| 1059 |
+
self.print_fn("\n-paper not found by file, \nfile: "+file+"\nall papers: "+', '.join([p['id'] for p in papers]))
|
| 1060 |
|
| 1061 |
|
| 1062 |
def alpha_length(self, s):
|
|
|
|
| 1190 |
else:
|
| 1191 |
discarded_ids.append(urlparse(result.entry_id).path.split('/')[-1].split('v')[0])
|
| 1192 |
|
| 1193 |
+
self.print_fn("\n-Papers discarded due to id error [arxiv api bug: #74] :\n" + str(discarded_ids))
|
| 1194 |
|
| 1195 |
return results, searched_papers
|
| 1196 |
|
|
|
|
| 1198 |
import arxiv
|
| 1199 |
from urllib.parse import urlparse
|
| 1200 |
ids = [p['id'] for p in papers]
|
| 1201 |
+
self.print_fn("\n-downloading below selected papers: ")
|
| 1202 |
self.print_fn(ids)
|
| 1203 |
# asert(False)
|
| 1204 |
papers_filtered = arxiv.Search(id_list=ids).get()
|
|
|
|
| 1241 |
|
| 1242 |
|
| 1243 |
cites = internal_citations.citation_list_parallel(N=multiprocessing.cpu_count(), directory=txt_dir)
|
| 1244 |
+
self.print_fn("\n-citation-network: ")
|
| 1245 |
self.print_fn(cites)
|
| 1246 |
|
| 1247 |
for p in papers:
|
|
|
|
| 1353 |
if not num_papers:
|
| 1354 |
num_papers = self.DEFAULTS['num_papers']
|
| 1355 |
# arxiv api relevance search and data preparation
|
| 1356 |
+
self.print_fn("\n-searching arXiv for top 100 papers.. ")
|
| 1357 |
results, searched_papers = self.search(query, max_search=max_search)
|
| 1358 |
joblib.dump(searched_papers, self.dump_dir + 'papers_metadata.dmp')
|
| 1359 |
+
self.print_fn("\n-found " + str(len(searched_papers)) + " papers")
|
| 1360 |
|
| 1361 |
# paper selection by scibert vector embedding relevance scores
|
| 1362 |
# papers_selected = select_papers(searched_papers, query, num_papers=num_papers)
|
|
|
|
| 1369 |
|
| 1370 |
joblib.dump(papers_highlighted, self.dump_dir + 'papers_highlighted.dmp')
|
| 1371 |
|
| 1372 |
+
self.print_fn("\n-Standardizing known section headings per paper.. ")
|
| 1373 |
papers_standardized = self.standardize_headings(papers_highlighted)
|
| 1374 |
joblib.dump(papers_standardized, self.dump_dir + 'papers_standardized.dmp')
|
| 1375 |
|
| 1376 |
+
self.print_fn("\n-Building paper-wise corpus.. ")
|
| 1377 |
corpus = self.build_corpus(papers_highlighted, searched_papers)
|
| 1378 |
joblib.dump(corpus, self.dump_dir + 'corpus.dmp')
|
| 1379 |
|
| 1380 |
+
self.print_fn("\n-Building section-wise corpus.. ")
|
| 1381 |
corpus_sectionwise = self.build_corpus_sectionwise(papers_standardized)
|
| 1382 |
joblib.dump(corpus_sectionwise, self.dump_dir + 'corpus_sectionwise.dmp')
|
| 1383 |
|
| 1384 |
+
self.print_fn("\n-Building basic research highlights.. ")
|
| 1385 |
research_blocks = self.build_basic_blocks(corpus_sectionwise, corpus)
|
| 1386 |
joblib.dump(research_blocks, self.dump_dir + 'research_blocks.dmp')
|
| 1387 |
|
| 1388 |
+
self.print_fn("\n-Reducing corpus to lines.. ")
|
| 1389 |
corpus_lines = self.get_corpus_lines(corpus)
|
| 1390 |
joblib.dump(corpus_lines, self.dump_dir + 'corpus_lines.dmp')
|
| 1391 |
|
|
|
|
| 1419 |
'''
|
| 1420 |
# self.print_fn("corpus types:"+ str(np.unique([type(txt) for k,txt in corpus.items()])))
|
| 1421 |
|
| 1422 |
+
self.print_fn("\n-Building abstract.. ")
|
| 1423 |
abstract_block = self.get_abstract(corpus_lines, corpus_sectionwise, research_blocks)
|
| 1424 |
joblib.dump(abstract_block, self.dump_dir + 'abstract_block.dmp')
|
| 1425 |
'''
|
|
|
|
| 1428 |
self.print_fn(abstract_block)
|
| 1429 |
'''
|
| 1430 |
|
| 1431 |
+
self.print_fn("\n-Building introduction.. ")
|
| 1432 |
intro_block = self.get_intro(corpus_sectionwise, research_blocks)
|
| 1433 |
joblib.dump(intro_block, self.dump_dir + 'intro_block.dmp')
|
| 1434 |
'''
|
|
|
|
| 1436 |
self.print_fn("intro_block:")
|
| 1437 |
self.print_fn(intro_block)
|
| 1438 |
'''
|
| 1439 |
+
self.print_fn("\n-Building custom sections.. ")
|
| 1440 |
clustered_sections, clustered_sentences = self.get_clusters(papers_standardized, searched_papers)
|
| 1441 |
joblib.dump(clustered_sections, self.dump_dir + 'clustered_sections.dmp')
|
| 1442 |
joblib.dump(clustered_sentences, self.dump_dir + 'clustered_sentences.dmp')
|
|
|
|
| 1454 |
clustered_sections['introduction'] = intro_block
|
| 1455 |
joblib.dump(clustered_sections, self.dump_dir + 'research_sections.dmp')
|
| 1456 |
|
| 1457 |
+
self.print_fn("\n-Building conclusion.. ")
|
| 1458 |
conclusion_block = self.get_conclusion(clustered_sections)
|
| 1459 |
joblib.dump(conclusion_block, self.dump_dir + 'conclusion_block.dmp')
|
| 1460 |
clustered_sections['conclusion'] = conclusion_block
|
|
|
|
| 1471 |
shutil.copy(self.dump_dir + survey_file, survey_file)
|
| 1472 |
assert (os.path.exists(survey_file))
|
| 1473 |
output_zip = self.zip_outputs(self.dump_dir, query)
|
| 1474 |
+
self.print_fn("\n-Survey complete.. \nSurvey file path :" + os.path.abspath(
|
| 1475 |
survey_file) + "\nAll outputs zip path :" + os.path.abspath(self.dump_dir + output_zip))
|
| 1476 |
|
| 1477 |
return os.path.abspath(self.dump_dir + output_zip), os.path.abspath(survey_file)
|