File size: 3,899 Bytes
efeacc7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
import os
from whoosh import index
import pymupdf
from acronym_finder.acronym_finder_function import acronym_dict_generator
from search_engine_functions import do
from whoosh.fields import Schema, TEXT, KEYWORD, ID, STORED, DATETIME
import re
# Whoosh schema for the index: one document per source file.
schema = Schema(
    title=ID(stored=True),
    date=DATETIME(stored=True),
    content=TEXT(stored=True),
    # The indexing script in this file stores acronyms as a single
    # comma-joined string. KEYWORD is space-separated by default, which would
    # index that whole string as one term; commas=True makes each
    # comma-separated entry its own searchable keyword.
    acronyms=KEYWORD(stored=True, commas=True),
    file_name=ID(stored=True),
)

import re
import datefinder

# Pattern passed to acronym_dict_generator by the indexing script below.
# Group 1: a phrase of two or more capitalised words (each an uppercase letter
#          followed by word characters, commas or apostrophes), where
#          consecutive capitalised words are separated by one or more of:
#          whitespace, "&", "-", or a short connector (and/or/the/of/to/in/
#          on/at/for/an).
# Group 2: the letters/spaces inside the parentheses that immediately follow
#          the phrase (after a single whitespace character) -- presumably the
#          acronym itself; confirm against acronym_dict_generator.
acronym_regex = r"([A-Z][\w,’‘']+(?:(?:\s|&|and|or|the|of|to|in|on|at|for|an|-)+[A-Z][\w,’‘']+){1,})\s\(([A-Za-z\s]+)\)"

def find_single_date(string):
    """Return the first date datefinder yields for ``string``, or "" if none.

    The input is stripped of surrounding whitespace before it is scanned.
    Only the first match produced by ``datefinder.find_dates`` is consumed.
    """
    candidates = datefinder.find_dates(string.strip())
    earliest = next(candidates, None)
    # Callers test the result for truthiness, so "no date" is the empty string.
    return earliest if earliest else ""

def doc_processor(doc, file_name):
    """Extract a date, a title and the flattened full text from a document.

    Args:
        doc: An opened document. It must support ``doc[0]`` and iteration over
            pages, and each page must provide ``get_text()`` (PyMuPDF-style).
        file_name: Name of the source file; used as a fallback source for the
            date when none is found in the first page's spans.

    Returns:
        A ``(date, final_title, all_text)`` tuple. ``date`` is whatever
        ``find_single_date`` returned, or ``""`` if no date was found.
        ``final_title`` is ``""`` when no title could be determined.
        ``all_text`` is the text of every page joined together with newlines
        and runs of whitespace collapsed to single spaces. If the first page
        has no text blocks, ``("", "", "")`` is returned.
    """
    # NOTE(review): flags 1 + 2 + 8 presumably mean "preserve ligatures",
    # "preserve whitespace" and "inhibit spaces" -- confirm against the
    # PyMuPDF TEXT_* constants.
    blocks = doc[0].get_text(
        "dict",
        flags=1 + 2 + 8,
        sort=True,
    )["blocks"]
    if not blocks:
        return "", "", ""

    # Span "flags" values treated as title styling. Per the PyMuPDF span
    # flag bits, 16 is bold and 20 is bold + serif.
    title_flags = (20, 16)

    title = ''
    final_title = ''
    date = ''

    # Preferred title source: the text between a "What:" label and the
    # following "Date" line on the first page.
    first_page_text = doc[0].get_text()
    title_match = re.search(
        r"(?<=What:).+?([A-Z].+?)(?=\nDate)", first_page_text, re.DOTALL
    )
    if title_match:
        final_title = title_match[1]

    for block in blocks:
        for line in block["lines"]:
            for span in line["spans"]:
                # The first span without title styling ends the title that
                # has been accumulated so far (which may still be empty, in
                # which case accumulation simply continues).
                if not final_title and span["flags"] not in title_flags:
                    title = title.strip()
                    final_title = re.sub(r"\s{2,}", " ", title)

                # The first span that contains a recognisable date wins.
                if not date:
                    date = find_single_date(span['text'])

                if span['bbox'][0] > 350:
                    # Skip headers/watermarks
                    continue

                span_text = span['text'].strip()
                looks_like_title = (
                    span["flags"] in title_flags or span['size'] > 25
                )
                if (
                    looks_like_title
                    and span_text
                    and re.search(r"[a-zA-Z]{3,}", span_text)
                    and not final_title
                ):
                    title += span['text']

    if not date:
        # try yyyymm
        date_find = re.search(r"(2\d{3})(\d{2})", file_name)
        if date_find:
            date = find_single_date(date_find[1] + ' ' + date_find[2])
    if not date:
        # try ddmmyy -- only when yyyymm produced nothing. Any yyyymm match is
        # also a six-digit run, so running this unconditionally would always
        # overwrite the yyyymm result.
        date_find = re.search(r"\d{6}", file_name)
        if date_find:
            digits = date_find[0]
            date_reorder = ' '.join(digits[i:i + 2] for i in range(0, 6, 2))
            date = find_single_date(date_reorder)

    # Join all pages in one pass, then flatten newlines and repeated
    # whitespace into single spaces.
    all_text = ''.join(page.get_text() for page in doc)
    all_text = re.sub(r"\n", " ", all_text)
    all_text = re.sub(r"\s{2,}", " ", all_text)
    return (date, final_title, all_text)



if __name__ == "__main__":
    # Build a fresh Whoosh index from every file in the research data folder.
    index_dir = 'index'
    data_dir = 'data/mopac_research'
    os.makedirs(index_dir, exist_ok=True)

    # create_in() returns the newly created index, so it does not need to be
    # reopened from disk before a writer is requested.
    ix = index.create_in(index_dir, schema)
    writer = ix.writer(procs=16, multisegment=True)

    try:
        for file_name in os.listdir(data_dir):
            file = os.path.join(data_dir, file_name)
            # Close each document as soon as its text has been extracted so
            # file handles do not accumulate over the whole run.
            with pymupdf.open(file) as doc:
                date, final_title, all_text = doc_processor(doc, file_name)
            if all_text:
                acronym_dict = acronym_dict_generator(all_text, acronym_regex=acronym_regex)
                keywords = ",".join(acronym_dict)
                writer.add_document(title=final_title,
                                    content=all_text,
                                    # doc_processor reports "no date" as "";
                                    # store that as a missing value instead.
                                    date=date if date else None,
                                    acronyms=keywords,
                                    file_name=file_name,
                                    )
            else:
                print('skipping file ', file_name, ' ,blank')
    except BaseException:
        # Abandon the partial index and release the writer (and its worker
        # processes) rather than leaving it open when indexing fails.
        writer.cancel()
        raise
    else:
        writer.commit()