# denovoref
# Add binary files from src/search_engine/index
# efeacc7
import os
from whoosh import index
import pymupdf
from acronym_finder.acronym_finder_function import acronym_dict_generator
from search_engine_functions import do
from whoosh.fields import Schema, TEXT, KEYWORD, ID, STORED, DATETIME
import re
# Whoosh schema for one indexed PDF document. Every field is stored so that
# search hits can be displayed without re-opening the source file.
schema = Schema(
    # Document title, indexed as a single untokenised value.
    title=ID(stored=True),
    # Document date; the indexing script passes None when no date was found.
    date=DATETIME(stored=True),
    # Whitespace-normalised text of the whole document.
    content=TEXT(stored=True),
    # The indexing script writes this field as a comma-joined string, so the
    # field must be declared comma-separated; the KEYWORD default splits on
    # whitespace only, which would index "A,B,C" as one term.
    # NOTE: an index already on disk keeps the schema it was created with and
    # has to be rebuilt to pick this up.
    acronyms=KEYWORD(stored=True, commas=True),
    # Name of the source file inside the data directory.
    file_name=ID(stored=True),
)
import re
import datefinder
# Matches a capitalised multi-word phrase followed by a bracketed short form,
# e.g. "Metropolitan Police Service (MPS)": group 1 is the phrase, group 2 the
# text inside the brackets. Words may be linked by whitespace, "&", "-" or the
# listed lower-case connectives. The character classes accept the straight
# apostrophe plus the curly quotes U+2019 and U+2018 so that possessives such
# as "Mayor’s" stay part of the phrase.
acronym_regex = r"([A-Z][\w,’‘']+(?:(?:\s|&|and|or|the|of|to|in|on|at|for|an|-)+[A-Z][\w,’‘']+){1,})\s\(([A-Za-z\s]+)\)"
def find_single_date(string):
    """Return the first date ``datefinder`` finds in *string*, or ``""``.

    Leading and trailing whitespace is stripped before searching. Only the
    first match is consumed from the ``datefinder.find_dates`` generator; the
    empty string signals that nothing date-like was found.
    """
    candidates = datefinder.find_dates(string.strip())
    # A missing (or falsy) first match collapses to the empty-string sentinel.
    return next(candidates, None) or ""
def doc_processor(doc, file_name):
    """Extract the date, title and flattened text of an opened PDF document.

    Title: taken from a ``What: ... Date`` header in the first page's text
    when that pattern is present. Otherwise it is built from the first page's
    spans: text is accumulated from spans whose ``flags`` are 16 or 20 (per
    PyMuPDF's span flag bits: bold, and bold + serifed) or whose font size is
    above 25, and the title is closed at the first span that is not 16/20.

    Date: the first date ``find_single_date`` finds in any first-page span.
    If none is found, the file name is tried as ``yyyymm`` and then, only if
    that also yields nothing, as ``ddmmyy``.

    Args:
        doc: Opened document; must support ``doc[0]`` and iteration over
            pages that provide ``get_text``.
        file_name: Name of the source file, used as a fallback date source.

    Returns:
        Tuple ``(date, final_title, all_text)``. ``date`` is whatever
        ``find_single_date`` returned (``""`` when nothing was found),
        ``all_text`` is the text of all pages with newlines and runs of
        whitespace collapsed to single spaces. ``("", "", "")`` is returned
        when the first page has no text blocks.
    """
    first_page = doc[0]
    blocks = first_page.get_text(
        "dict",
        # presumably TEXT_PRESERVE_LIGATURES | TEXT_PRESERVE_WHITESPACE |
        # TEXT_INHIBIT_SPACES -- confirm against the PyMuPDF constants
        flags=1 + 2 + 8,
        sort=True,
    )["blocks"]
    if not blocks:
        return "", "", ""

    title = ''
    final_title = ''
    date = ''

    first_page_text = first_page.get_text()
    title_match = re.search(
        r"(?<=What:).+?([A-Z].+?)(?=\nDate)", first_page_text, re.DOTALL
    )
    if title_match:
        final_title = title_match[1]

    for block in blocks:  # iterate through the text blocks
        for line in block.get("lines", ()):  # iterate through the text lines
            for span in line["spans"]:  # iterate through the text spans
                is_title_font = span["flags"] in (20, 16)
                if not final_title and not is_title_font:
                    # First span outside the title font closes the title that
                    # has been accumulated so far (still empty if none yet).
                    title = title.strip()
                    final_title = re.sub(r"\s{2,}", " ", title)
                if not date:
                    # Dates are looked for before the position filter below,
                    # so right-hand spans can still supply the date.
                    date = find_single_date(span['text'])
                if span['bbox'][0] > 350:
                    # Skip headers/watermarks
                    continue
                text = span['text'].strip()
                if (
                    (is_title_font or span['size'] > 25)
                    and text
                    and re.search(r"[a-zA-Z]{3,}", text)
                    and not final_title
                ):
                    title += span['text']
    # NOTE(review): a title that is still being accumulated when the page ends
    # is discarded (final_title stays empty) -- confirm this is intended.

    if not date:
        # try yyyymm
        date_find = re.search(r"(2\d{3})(\d{2})", file_name)
        if date_find:
            date_reorder = date_find[1] + ' ' + date_find[2]
            date = find_single_date(date_reorder)
    if not date:
        # try ddmmyy -- only when yyyymm gave nothing. Every yyyymm hit is
        # also a six-digit run, so running this unconditionally would always
        # overwrite the yyyymm result.
        date_find = re.search(r"\d{6}", file_name)
        if date_find:
            digits = date_find[0]
            date_reorder = ' '.join(digits[i:i + 2] for i in range(0, 6, 2))
            date = find_single_date(date_reorder)

    # Join page texts in one pass, then flatten newlines and whitespace runs.
    all_text = ''.join(page.get_text() for page in doc)
    all_text = re.sub(r"\n", " ", all_text)
    all_text = re.sub(r"\s{2,}", " ", all_text)
    return date, final_title, all_text
def main():
    """Index every file in ``data/mopac_research`` into the ``index`` directory.

    The index is created when the directory holds none yet, otherwise the
    existing index is opened and extended. Each file is opened with PyMuPDF,
    run through ``doc_processor`` and, when it yields any text, added together
    with the comma-joined keys of its acronym dictionary. Files without text
    are reported and skipped.
    """
    index_dir = 'index'
    data_dir = 'data/mopac_research'

    os.makedirs(index_dir, exist_ok=True)
    # exists_in also covers a directory that exists but holds no index yet,
    # where open_dir alone would fail.
    if index.exists_in(index_dir):
        ix = index.open_dir(index_dir)
    else:
        ix = index.create_in(index_dir, schema)

    # As a context manager the writer commits on normal exit and cancels
    # (releasing the index lock) if an exception escapes the loop.
    with ix.writer(procs=16, multisegment=True) as writer:
        for file_name in os.listdir(data_dir):
            pdf_path = os.path.join(data_dir, file_name)
            # Close each document as soon as its text has been extracted.
            with pymupdf.open(pdf_path) as doc:
                date, final_title, all_text = doc_processor(doc, file_name)
            if all_text:
                acronym_dict = acronym_dict_generator(
                    all_text, acronym_regex=acronym_regex
                )
                keywords = ",".join(acronym_dict.keys())
                writer.add_document(
                    title=final_title,
                    content=all_text,
                    # doc_processor signals "no date" with an empty string.
                    date=date if date else None,
                    acronyms=keywords,
                    file_name=file_name,
                )
            else:
                print('skipping file ', file_name, ' ,blank')


if __name__ == "__main__":
    main()