"""Build a Whoosh full-text index from the PDF files in ``data/mopac_research``.

For every document the script extracts a title, a date, the full text and the
acronyms found in that text, and stores them as one Whoosh document.
"""

import os
import re

import datefinder
import pymupdf
from whoosh import index
from whoosh.fields import Schema, TEXT, KEYWORD, ID, STORED, DATETIME

from acronym_finder.acronym_finder_function import acronym_dict_generator
from search_engine_functions import do

DATA_DIR = 'data/mopac_research'
INDEX_DIR = 'index'

schema = Schema(
    title=ID(stored=True),
    date=DATETIME(stored=True),
    content=TEXT(stored=True),
    acronyms=KEYWORD(stored=True),
    file_name=ID(stored=True),
)

# Matches "Capitalised Multi Word Phrase (ABBR)": group 1 is the long form,
# group 2 the bracketed short form.
acronym_regex = r"([A-Z][\w,’‘']+(?:(?:\s|&|and|or|the|of|to|in|on|at|for|an|-)+[A-Z][\w,’‘']+){1,})\s\(([A-Za-z\s]+)\)"

# Title written between a "What:" label and a following "Date" line.
_TITLE_RE = re.compile(r"(?<=What:).+?([A-Z].+?)(?=\nDate)", re.DOTALL)
_MULTI_SPACE_RE = re.compile(r"\s{2,}")
_WORD_RE = re.compile(r"[a-zA-Z]{3,}")

# Span font flags treated as heading text. Per the PyMuPDF span-flag bits,
# 16 is bold and 20 is bold + serifed.
_HEADING_FLAGS = (20, 16)
# Spans larger than this are treated as heading text whatever their flags.
_HEADING_MIN_SIZE = 25
# Spans whose left edge lies beyond this x coordinate are skipped as
# headers/watermarks when collecting the title.
_MAX_TITLE_X = 350


def find_single_date(string):
    """Return the first date ``datefinder`` finds in *string*.

    Args:
        string: Text to scan; surrounding whitespace is stripped first.

    Returns:
        The first match yielded by ``datefinder.find_dates``, or ``""`` when
        there is none.
    """
    return next(datefinder.find_dates(string.strip()), None) or ""


def _date_from_file_name(file_name):
    """Return a date parsed from digits in *file_name*, or ``""`` if none."""
    # try yyyymm
    match = re.search(r"(2\d{3})(\d{2})", file_name)
    if match:
        date = find_single_date(match[1] + ' ' + match[2])
        if date:
            return date
    # try ddmmyy -- only reached when yyyymm gave nothing. Every yyyymm match
    # is also six digits, so running this unconditionally would always
    # overwrite the result above.
    match = re.search(r"\d{6}", file_name)
    if match:
        digits = match[0]
        pairs = ' '.join(digits[i:i + 2] for i in range(0, 6, 2))
        return find_single_date(pairs)
    return ""


def doc_processor(doc, file_name):
    """Extract the date, title and full text of an opened PDF document.

    The title comes from a "What: ... Date" pattern on the first page when
    present, otherwise from the leading run of heading-styled spans on that
    page. The date is the first one found in any first-page span, falling
    back to digits in *file_name*.

    Args:
        doc: An opened PyMuPDF document (indexable and iterable by page).
        file_name: Name of the source file, used only for the date fallback.

    Returns:
        A ``(date, title, text)`` tuple. ``date`` is ``""`` when no date was
        found. All three are ``""`` when the document has no pages or its
        first page has no text blocks.
    """
    if len(doc) == 0:
        return "", "", ""

    first_page = doc[0]
    # NOTE(review): flags 1 + 2 + 8 presumably mean preserve ligatures,
    # preserve whitespace and inhibit spaces, with image blocks excluded --
    # confirm against the PyMuPDF text-extraction flag constants.
    blocks = first_page.get_text("dict", flags=1 + 2 + 8, sort=True)["blocks"]
    if not blocks:
        # NOTE(review): a document whose first page has no text blocks is
        # reported as blank even if later pages contain text -- confirm this
        # is intended.
        return "", "", ""

    final_title = ''
    title_match = _TITLE_RE.search(first_page.get_text())
    if title_match:
        final_title = title_match[1]

    title = ''
    date = ''
    for block in blocks:
        for line in block["lines"]:
            for span in line["spans"]:
                text = span['text']
                is_heading_font = span["flags"] in _HEADING_FLAGS

                # The first non-heading span closes the title collected so far.
                if not final_title and not is_heading_font:
                    title = title.strip()
                    final_title = _MULTI_SPACE_RE.sub(" ", title)

                if not date:
                    date = find_single_date(text)

                if span['bbox'][0] > _MAX_TITLE_X:  # Skip headers/watermarks
                    continue

                if (
                    not final_title
                    and (is_heading_font or span['size'] > _HEADING_MIN_SIZE)
                    and _WORD_RE.search(text.strip())
                ):
                    title += text

    # A page that ends on heading spans never hits the closing branch above,
    # so finalize whatever was collected.
    if not final_title:
        final_title = _MULTI_SPACE_RE.sub(" ", title.strip())

    if not date:
        date = _date_from_file_name(file_name)

    all_text = ''.join(page.get_text() for page in doc)
    all_text = all_text.replace("\n", " ")
    all_text = _MULTI_SPACE_RE.sub(" ", all_text)
    return date, final_title, all_text


def main():
    """Index every file in ``DATA_DIR`` into the Whoosh index at ``INDEX_DIR``."""
    os.makedirs(INDEX_DIR, exist_ok=True)
    # Create the index when the directory holds none; an empty pre-existing
    # directory would otherwise make open_dir fail.
    # NOTE(review): re-running against an existing index adds every document
    # again -- confirm whether duplicates are acceptable.
    if index.exists_in(INDEX_DIR):
        ix = index.open_dir(INDEX_DIR)
    else:
        ix = index.create_in(INDEX_DIR, schema)

    # As a context manager the writer commits on success and cancels
    # (releasing the write lock) if anything below raises.
    with ix.writer(procs=16, multisegment=True) as writer:
        for file_name in os.listdir(DATA_DIR):
            path = os.path.join(DATA_DIR, file_name)
            with pymupdf.open(path) as doc:
                date, final_title, all_text = doc_processor(doc, file_name)

            if not all_text:
                print('skipping file ', file_name, ' ,blank')
                continue

            acronym_dict = acronym_dict_generator(all_text, acronym_regex=acronym_regex)
            writer.add_document(
                title=final_title,
                content=all_text,
                date=date if date else None,
                acronyms=",".join(acronym_dict.keys()),
                file_name=file_name,
            )


if __name__ == "__main__":
    main()