Spaces:
Sleeping
Sleeping
| import os | |
| from whoosh import index | |
| import pymupdf | |
| from acronym_finder.acronym_finder_function import acronym_dict_generator | |
| from search_engine_functions import do | |
| from whoosh.fields import Schema, TEXT, KEYWORD, ID, STORED, DATETIME | |
| import re | |
# Whoosh schema for one indexed document. Every field is stored so that it
# can be read back from search hits.
schema = Schema(
    # Title extracted from the first page of the document.
    title=ID(stored=True),
    # Document date; left unset when no date could be determined.
    date=DATETIME(stored=True),
    # Whitespace-normalised text of the whole document.
    content=TEXT(stored=True),
    # The indexing script supplies acronyms as one comma-joined string.
    # KEYWORD splits on whitespace unless ``commas=True`` is given, so
    # without it a multi-acronym document is indexed as a single token
    # such as "MPS,VAWG" and individual acronyms cannot be matched.
    acronyms=KEYWORD(stored=True, commas=True),
    # Base name of the source file inside the data directory.
    file_name=ID(stored=True),
)
| import re | |
| import datefinder | |
# Pattern for "Long Form Name (ACRONYM)" pairs.
#   group 1: two or more capitalised words, optionally joined by small
#            connector words, "&" or "-".
#   group 2: the letters (and whitespace) found inside the parentheses.
# The value is assembled from adjacent raw-string pieces purely for
# readability; the resulting pattern is a single string.
# NOTE(review): the "β" characters inside the character classes look like
# mis-encoded curly apostrophes -- confirm against the original file before
# changing them; they are kept verbatim here.
acronym_regex = (
    r"("
    r"[A-Z][\w,ββ']+"
    r"(?:(?:\s|&|and|or|the|of|to|in|on|at|for|an|-)+[A-Z][\w,ββ']+){1,}"
    r")"
    r"\s\(([A-Za-z\s]+)\)"
)
def find_single_date(string):
    """Return the first date ``datefinder`` reports for *string*.

    Surrounding whitespace is stripped before the text is scanned. The
    result is the first item yielded by ``datefinder.find_dates``; when
    nothing is yielded (or the first item is falsy) an empty string is
    returned instead.
    """
    candidates = datefinder.find_dates(string.strip())
    # Only the first hit matters; later matches are never consumed.
    return next(candidates, None) or ""
def _date_from_file_name(file_name):
    """Guess a document date from digit runs in *file_name*.

    Two readings are attempted: ``yyyymm`` (a four-digit year starting with
    2 followed by a two-digit month) and any six-digit run read as three
    two-digit groups (``ddmmyy``). When both readings parse, the six-digit
    one takes precedence; when it does not parse, the ``yyyymm`` result is
    kept rather than being overwritten with an empty value.

    Returns the value produced by ``find_single_date``, or ``""`` when
    neither reading yields a date.
    """
    date = ""
    # try yyyymm
    year_month = re.search(r"(2\d{3})(\d{2})", file_name)
    if year_month:
        date = find_single_date(year_month[1] + " " + year_month[2])
    # try ddmmyy
    six_digits = re.search(r"\d{6}", file_name)
    if six_digits:
        digits = six_digits[0]
        pairs = " ".join(digits[i:i + 2] for i in range(0, 6, 2))
        candidate = find_single_date(pairs)
        if candidate:
            # Only replace the earlier result when this reading succeeded.
            date = candidate
    return date


def doc_processor(doc, file_name):
    """Extract the date, title and full text of an opened document.

    Args:
        doc: An opened document whose pages support ``get_text`` (the script
            passes the result of ``pymupdf.open``).
        file_name: Base name of the file, used as a fallback source for the
            date when none is found in the first-page text.

    Returns:
        A ``(date, final_title, all_text)`` tuple. ``date`` is whatever
        ``find_single_date`` returned, or ``""`` when no date was found.
        ``final_title`` is ``""`` when no title could be determined.
        ``all_text`` is the text of every page with newlines and repeated
        whitespace collapsed to single spaces. When the first page has no
        text blocks, ``("", "", "")`` is returned.
    """
    first_page = doc[0]
    # NOTE(review): flags 1 + 2 + 8 presumably select PyMuPDF's
    # preserve-ligatures, preserve-whitespace and inhibit-spaces options --
    # confirm against the installed PyMuPDF version.
    blocks = first_page.get_text(
        "dict",
        flags=1 + 2 + 8,
        sort=True,
    )["blocks"]
    if not blocks:
        return "", "", ""

    # Span flag values treated as heading text.
    # NOTE(review): in PyMuPDF 16 looks like "bold" and 20 "bold + serifed"
    # -- confirm.
    heading_flags = (20, 16)

    title = ''
    final_title = ''
    date = ''

    # A "What: ... Date" layout on the first page gives the title directly.
    first_page_text = first_page.get_text()
    title_match = re.search(
        r"(?<=What:).+?([A-Z].+?)(?=\nDate)", first_page_text, re.DOTALL
    )
    if title_match:
        final_title = title_match[1]

    for block in blocks:  # iterate through the text blocks
        for line in block["lines"]:  # iterate through the text lines
            for span in line["spans"]:  # iterate through the text spans
                text = span['text']
                is_heading = span["flags"] in heading_flags

                # The first non-heading span closes the title collected so
                # far (if nothing was collected yet this leaves it empty).
                if not final_title and not is_heading:
                    final_title = re.sub(r"\s{2,}", " ", title.strip())

                # The first span that contains a date supplies the date.
                if not date:
                    date = find_single_date(text)

                if span['bbox'][0] > 350:
                    # Skip headers/watermarks
                    continue

                if (
                    (is_heading or span['size'] > 25)
                    and text.strip()
                    and re.search(r"[a-zA-Z]{3,}", text.strip())
                    and not final_title
                ):
                    title += text

    # If the page ended while heading spans were still being collected, no
    # non-heading span ever closed the title; close it here so it is not
    # silently lost.
    if not final_title and title:
        final_title = re.sub(r"\s{2,}", " ", title.strip())

    if not date:
        date = _date_from_file_name(file_name)

    # Join the page texts in one pass, then flatten all whitespace.
    all_text = ''.join(page.get_text() for page in doc)
    all_text = re.sub(r"\n", " ", all_text)
    all_text = re.sub(r"\s{2,}", " ", all_text)
    return (date, final_title, all_text)
if __name__ == "__main__":
    # Build (or extend) the Whoosh index from every file in the data folder.
    index_dir = 'index'
    data_dir = 'data/mopac_research'

    # Create the index only when none exists yet. Checking for an actual
    # index (not just the directory) avoids a crash when the directory is
    # present but empty.
    os.makedirs(index_dir, exist_ok=True)
    if index.exists_in(index_dir):
        ix = index.open_dir(index_dir)
    else:
        ix = index.create_in(index_dir, schema)

    writer = ix.writer(procs=16, multisegment=True)
    try:
        for file_name in os.listdir(data_dir):
            file = os.path.join(data_dir, file_name)
            # The context manager closes the document as soon as its text
            # has been extracted.
            with pymupdf.open(file) as doc:
                date, final_title, all_text = doc_processor(doc, file_name)

            if all_text:
                acronym_dict = acronym_dict_generator(
                    all_text, acronym_regex=acronym_regex
                )
                # Acronyms are stored as one comma-separated string.
                keywords = ",".join(acronym_dict.keys())
                writer.add_document(
                    title=final_title,
                    content=all_text,
                    # An empty date is passed as None so the field is unset.
                    date=date if date else None,
                    acronyms=keywords,
                    file_name=file_name,
                )
            else:
                print('skipping file ', file_name, ' ,blank')
    except BaseException:
        # Abandon the partial write so the writer is not left open, then
        # let the original error propagate.
        writer.cancel()
        raise
    else:
        writer.commit()