File size: 5,202 Bytes
95a5062
1bba217
 
 
 
 
 
f586d1e
1bba217
 
 
 
 
 
f586d1e
c599bf4
1bba217
 
f586d1e
2bc03b3
1bba217
 
 
 
 
 
f586d1e
1bba217
 
 
f586d1e
1bba217
009dd9c
1bba217
 
 
 
2bc03b3
1bba217
2bc03b3
 
 
 
 
1bba217
2bc03b3
1bba217
2bc03b3
 
 
 
c599bf4
 
2bc03b3
 
ee033d3
1bba217
 
2bc03b3
50ef89f
 
9519a3b
95a5062
2bc03b3
50ef89f
009dd9c
c599bf4
009dd9c
2bc03b3
ee033d3
1bba217
 
098ee83
 
c599bf4
95a5062
1bba217
c599bf4
1bba217
 
 
 
0c734ac
1bba217
 
 
 
0c734ac
 
1bba217
c599bf4
1bba217
 
 
 
 
 
 
 
 
098ee83
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
# This file groups the text lines extracted for each page into paragraphs and marks each paragraph start on the processed page images
import pandas as pd
import numpy as np
import fitz
import re
import cv2


# Patterns used by paragraphs(); compiled once rather than on every line.
# A line "ends a sentence" when it ends in '.' or ':' followed by at most two
# non-letter characters (closing quote, footnote digit, bracket, ...).
_SENTENCE_END_RE = re.compile(r'[:.][^A-Za-z]{0,2}$')
# A line consisting only of the letters used for section labels (A-E, I, V, X).
_SECTION_HEADER_RE = re.compile(r'^[ABCDEIVX]+$')
# A separator line made only of asterisks and whitespace.
_ASTERISK_RE = re.compile(r'^[\s*]+$')
# A line ending in a bracketed "[<words/digits>, <digits>]" date.
_DATE_RE = re.compile(r'\[[A-Za-z\s0-9]*,\s[0-9]*\]+$')
_DISPOSITION_WORDS = ("affirm", "reverse", "vacate", "so ordered")


def paragraphs(folderpath):
    """Group the lines listed in ``<folderpath>/data.csv`` into paragraphs.

    ``data.csv`` must have a ``Pg Ind`` column and a ``Lines`` column. Each
    ``Lines`` cell is the textual form of a list of
    ``(x1, y1, x2, y2, text)`` entries for one page.

    A new paragraph is started whenever one of the layout heuristics below
    fires (section header / asterisk separator / date line, indented line
    after a sentence end, start or end of a block quote, disposition line).
    Lines with empty text are not added to any paragraph.

    Args:
        folderpath: Directory containing ``data.csv``.

    Returns:
        A ``pandas.DataFrame`` with a single ``Lines`` column. Each cell is a
        list of ``(page index, line index on page, in block quote, text)``
        tuples, one list per paragraph (a list may be empty).
    """
    df = pd.read_csv(folderpath + '/data.csv').replace({np.nan: None})

    x1s, line_texts, line_inds, pg_inds = [], [], [], []
    baselines, rights = {}, {}  # per page: leftmost x1 / rightmost x2
    for i, _ in enumerate(df['Pg Ind'].tolist()):
        # Pages are looked up by their position i in the CSV, which assumes
        # 'Pg Ind' holds 0..n-1 -- TODO confirm against the producer of data.csv.
        # NOTE(security): eval() executes whatever is stored in the 'Lines'
        # column; only run this on data.csv files you produced yourself
        # (ast.literal_eval would be safer if the column holds plain literals).
        lines = eval(df[df['Pg Ind'] == i]['Lines'].tolist()[0])
        if not lines:
            # A page without lines has no margins to measure; min()/max()
            # would raise on it and no line refers to it later.
            continue
        for j, line in enumerate(lines):
            x1s.append(line[0])
            line_texts.append(line[4])
            pg_inds.append(i)
            line_inds.append(j)
        baselines[i] = min(line[0] for line in lines)
        rights[i] = max(line[2] for line in lines)

    paras = []
    para = []  # bound up front so an input without lines still returns cleanly
    is_inblock = False
    for j, line_text in enumerate(line_texts):
        if not line_text:
            continue

        # The very first line has no predecessor to compare against; it simply
        # opens the first paragraph.
        if j > 0:
            prior_text = line_texts[j - 1].strip()
            current_text = line_text.strip()
            prior_pg, current_pg = pg_inds[j - 1], pg_inds[j]

            # Horizontal midpoint of the text area on each line's own page.
            prior_median = (baselines[prior_pg] + rights[prior_pg]) / 2
            current_median = (baselines[current_pg] + rights[current_pg]) / 2

            prior_endswith_period = _SENTENCE_END_RE.search(prior_text) is not None
            prior_is_date = _DATE_RE.search(prior_text) is not None
            is_section_header = (
                _SECTION_HEADER_RE.search(prior_text) is not None
                or _SECTION_HEADER_RE.search(current_text) is not None
                or _ASTERISK_RE.search(prior_text) is not None
                or _ASTERISK_RE.search(current_text) is not None
                or prior_is_date
            )

            # Indentation relative to the leftmost line of the same page.
            prior_indent = x1s[j - 1] - baselines[prior_pg]
            current_indent = x1s[j] - baselines[current_pg]
            current_tabbed = current_indent > 7
            prior_supertabbed = prior_indent >= 14
            current_supertabbed = current_indent >= 14
            prior_more_left = current_indent - prior_indent > 7
            prior_right_margin = x1s[j - 1] > prior_median
            # Compare the current line with the midpoint of its own page; the
            # prior line may sit on a different page with different margins.
            current_right_margin = x1s[j] > current_median

            # Indented first line following a sentence end.
            is_the_classic = (prior_endswith_period and current_tabbed
                              and prior_more_left and not prior_supertabbed)
            is_start_blockquote = (prior_endswith_period and current_supertabbed
                                   and prior_more_left and not is_inblock)
            is_after_blockquote = (prior_endswith_period
                                   and not current_supertabbed and is_inblock)
            is_after_disposition = prior_right_margin and current_tabbed
            lowered = line_text.lower()
            is_disposition = current_right_margin and any(
                word in lowered for word in _DISPOSITION_WORDS)

            if is_start_blockquote and not is_section_header:
                is_inblock = True
            # The date check keeps slip-opinion headers from being treated as
            # part of a block quote.
            if is_after_blockquote or prior_is_date:
                is_inblock = False

            if (is_section_header or is_the_classic or is_start_blockquote
                    or is_after_blockquote or is_after_disposition
                    or is_disposition):
                paras.append(para)
                para = []

        para.append((pg_inds[j], line_inds[j], is_inblock, line_text))

    paras.append(para)
    return pd.DataFrame({'Lines': paras})

def process_file(folderpath):
    """Write ``paragraphs.csv`` and mark paragraph starts on the page images.

    Runs ``paragraphs(folderpath)``, saves the result as
    ``<folderpath>/paragraphs.csv`` and then, for the first line of every
    non-empty paragraph, draws a small dot 15 px to the left of that line's
    ``x1`` at its vertical midpoint on ``<folderpath>/<page>-processed.png``.
    The images are modified in place.

    Args:
        folderpath: Directory containing ``data.csv`` and the
            ``<page>-processed.png`` images.

    Raises:
        FileNotFoundError: If a page image that needs a mark cannot be read.
    """
    paras_df = paragraphs(folderpath)
    paras_df.to_csv(folderpath + '/paragraphs.csv', index=False)
    data_df = pd.read_csv(folderpath + '/data.csv')

    page_lines = {}      # page index -> parsed 'Lines' cell (parsed once per page)
    marks_by_page = {}   # page index -> list of (x, y) dot centres
    for para_lines in paras_df['Lines'].tolist():
        if not para_lines:
            continue
        pg_ind, line_ind = para_lines[0][0], para_lines[0][1]
        if pg_ind not in page_lines:
            # NOTE(security): eval() executes whatever is stored in the 'Lines'
            # column; only run this on data.csv files you produced yourself.
            page_lines[pg_ind] = eval(
                data_df[data_df['Pg Ind'] == pg_ind]['Lines'].tolist()[0])
        x1, y1, _, y2 = page_lines[pg_ind][line_ind][:4]
        # cv2.circle needs integer pixel coordinates.
        centre = (int(x1 - 15), int(0.5 * (y1 + y2)))
        marks_by_page.setdefault(pg_ind, []).append(centre)

    # Read and write each page image once, however many paragraphs start on it.
    for pg_ind, centres in marks_by_page.items():
        image_path = folderpath + '/' + str(pg_ind) + '-processed.png'
        image = cv2.imread(image_path)
        if image is None:
            # cv2.imread signals an unreadable/missing file by returning None.
            raise FileNotFoundError('Could not read page image: ' + image_path)
        for centre in centres:
            cv2.circle(image, centre, radius=1, color=(240, 32, 160), thickness=2)
        cv2.imwrite(image_path, image)