File size: 4,677 Bytes
1bba217
 
 
 
 
 
f586d1e
1bba217
 
 
 
 
 
f586d1e
1bba217
 
 
f586d1e
 
1bba217
 
 
 
 
 
f586d1e
1bba217
 
 
f586d1e
1bba217
 
 
 
 
 
 
9519a3b
 
 
 
f586d1e
1bba217
f586d1e
1bba217
f586d1e
1bba217
f586d1e
 
 
 
9519a3b
 
1bba217
 
9519a3b
 
 
 
 
1bba217
9519a3b
1bba217
 
 
9519a3b
1bba217
9519a3b
1bba217
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9519a3b
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
import pandas as pd
import numpy as np
import fitz
import re
import cv2


# Patterns applied to a single stripped line of text. Raw strings avoid
# invalid-escape warnings, and `A-Za-z` replaces the `A-z` range, which also
# matches the punctuation characters that sit between 'Z' and 'a' in ASCII.
_SENTENCE_END_RE = re.compile(r'[:\.]([^A-Za-z]{0,2})$')  # Include colon?
_SECTION_HEADER_RE = re.compile(r'^([ABCDEIVX]+)$')
_ASTERISK_RE = re.compile(r'^([\s\*]+)$')
_DATE_RE = re.compile(r'(\[[A-Za-z\s0-9]*,\s[0-9]*\]+)$')

# Indentation thresholds, in the same units as the x coordinates in data.csv.
_TAB_INDENT = 7
_SUPERTAB_INDENT = 18


def _starts_new_paragraph(prior_text, current_text, prior_indent, current_indent, prior_right_margin):
    """Return True when the current line should open a new paragraph.

    `prior_indent` / `current_indent` are each line's x1 minus the left-most
    x1 on its page; `prior_right_margin` says whether the prior line starts
    to the right of its page's horizontal midpoint.
    """
    prior_text = prior_text.strip()
    current_text = current_text.strip()

    is_section_header = (
        _SECTION_HEADER_RE.search(prior_text) is not None
        or _SECTION_HEADER_RE.search(current_text) is not None
        or _ASTERISK_RE.search(prior_text) is not None
        or _ASTERISK_RE.search(current_text) is not None
        or _DATE_RE.search(prior_text) is not None
    )
    if is_section_header:
        return True

    prior_endswith_period = _SENTENCE_END_RE.search(prior_text) is not None
    current_tabbed = current_indent > _TAB_INDENT
    prior_tabbed = prior_indent > _TAB_INDENT
    current_supertabbed = current_indent > _SUPERTAB_INDENT
    prior_supertabbed = prior_indent > _SUPERTAB_INDENT
    prior_more_left = current_indent - prior_indent > _TAB_INDENT

    # Note: Supertabbing oofs stuff, hence the explicit exclusion here.
    is_the_classic = prior_endswith_period and current_tabbed and prior_more_left and not prior_supertabbed
    is_start_blockquote = prior_endswith_period and current_supertabbed and prior_more_left
    is_after_blockquote = prior_endswith_period and not current_supertabbed and prior_supertabbed
    is_after_disposition = prior_right_margin and current_tabbed
    is_after_oneline_paragraph = (
        prior_tabbed and current_tabbed and not prior_supertabbed and not current_supertabbed
    )
    return (
        is_the_classic
        or is_after_oneline_paragraph
        or is_start_blockquote
        or is_after_blockquote
        or is_after_disposition
    )


def paragraphs(folderpath):
    """Group the text lines listed in `<folderpath>/data.csv` into paragraphs.

    The CSV must have a 'Pg Ind' column and a 'Lines' column. Each 'Lines'
    cell is the string form of a sequence of line records whose first five
    items are `x1, y1, x2, y2, text`.

    Args:
        folderpath: Directory that contains `data.csv`.

    Returns:
        A DataFrame with a single 'Lines' column. Each cell is one paragraph:
        a list of `(page index, line index within the page, line text)`
        tuples. Lines with empty text are left out, and empty paragraphs are
        never emitted.
    """
    df = pd.read_csv(folderpath + '/data.csv').replace({np.nan: None})

    x1s, line_texts, line_inds, pg_inds = [], [], [], []
    baselines, rights = {}, {}
    for pg_ind in df['Pg Ind'].tolist():
        # Look the page up by its 'Pg Ind' value (not by row position) so page
        # ids that do not start at 0 or have gaps still resolve.
        raw_lines = df[df['Pg Ind'] == pg_ind]['Lines'].tolist()[0]
        # SECURITY NOTE: eval() executes whatever expression the CSV cell
        # holds. Only run this on data.csv files produced by a trusted step.
        lines = eval(raw_lines) if raw_lines is not None else []
        if not lines:
            # A page without lines has no margins to measure and nothing to add.
            continue
        for line_ind, line in enumerate(lines):
            x1s.append(line[0])
            line_texts.append(line[4])
            pg_inds.append(pg_ind)
            line_inds.append(line_ind)
        baselines[pg_ind] = min(line[0] for line in lines)
        rights[pg_ind] = max(line[2] for line in lines)

    paras = []
    para = []
    for j, line_text in enumerate(line_texts):
        if not line_text:
            continue

        # The very first line has no predecessor to compare against; it simply
        # opens the first paragraph.
        if j > 0:
            prior_pg, current_pg = pg_inds[j - 1], pg_inds[j]
            prior_median = (baselines[prior_pg] + rights[prior_pg]) / 2
            new_paragraph = _starts_new_paragraph(
                line_texts[j - 1],
                line_text,
                x1s[j - 1] - baselines[prior_pg],
                x1s[j] - baselines[current_pg],
                x1s[j - 1] > prior_median,
            )
            if new_paragraph:
                if para:
                    paras.append(para)
                para = []
        para.append((pg_inds[j], line_inds[j], line_text))

    if para:
        paras.append(para)
    paras_df = pd.DataFrame({'Lines': paras})
    return paras_df

def process_file(folderpath):
    """Write the paragraph table for a case folder and mark paragraph starts.

    Calls `paragraphs(folderpath)`, saves the result to
    `<folderpath>/paragraphs.csv`, then draws a small circle to the left of
    the first line of every paragraph on that page's
    `<folderpath>/<page index>-processed.png`, overwriting the image in place.

    Args:
        folderpath: Directory holding `data.csv` and the `*-processed.png`
            page images.

    Raises:
        FileNotFoundError: If a page image that needs a marker cannot be read.
    """
    paras_df = paragraphs(folderpath)
    paras_df.to_csv(folderpath + '/paragraphs.csv', index=True)
    data_df = pd.read_csv(folderpath + '/data.csv')

    page_lines = {}  # page index -> evaluated 'Lines' cell, parsed once per page
    centers_by_page = {}  # page index -> marker centres, in paragraph order
    for para_lines in paras_df['Lines'].tolist():
        if not para_lines:
            # An empty paragraph has no first line to mark.
            continue
        pg_ind, line_ind, _first_line = para_lines[0]
        if pg_ind not in page_lines:
            raw_lines = data_df[data_df['Pg Ind'] == pg_ind]['Lines'].tolist()[0]
            # SECURITY NOTE: eval() executes whatever expression the CSV cell
            # holds. Only run this on data.csv files produced by a trusted step.
            page_lines[pg_ind] = eval(raw_lines)
        x1, y1, _x2, y2, _text = page_lines[pg_ind][line_ind]
        # cv2 drawing functions need integer pixel coordinates; cast both axes.
        center = (int(x1 - 15), int(0.5 * (y1 + y2)))
        centers_by_page.setdefault(pg_ind, []).append(center)

    # Read and write each page image once, however many paragraphs start on it.
    for pg_ind, centers in centers_by_page.items():
        image_path = folderpath + '/' + str(pg_ind) + '-processed.png'
        image = cv2.imread(image_path)
        if image is None:
            # cv2.imread signals an unreadable or missing file by returning None.
            raise FileNotFoundError('Could not read page image: ' + image_path)
        for center in centers:
            cv2.circle(image, center, radius=1, color=(240, 32, 160), thickness=2)
        cv2.imwrite(image_path, image)

# Run the paragraph pipeline on one hard-coded case folder.
# NOTE(review): this call is not wrapped in an `if __name__ == '__main__':`
# guard, so it also executes whenever this module is imported.
process_file('PDF Cases/462_122')