File size: 5,202 Bytes
95a5062 1bba217 f586d1e 1bba217 f586d1e c599bf4 1bba217 f586d1e 2bc03b3 1bba217 f586d1e 1bba217 f586d1e 1bba217 009dd9c 1bba217 2bc03b3 1bba217 2bc03b3 1bba217 2bc03b3 1bba217 2bc03b3 c599bf4 2bc03b3 ee033d3 1bba217 2bc03b3 50ef89f 9519a3b 95a5062 2bc03b3 50ef89f 009dd9c c599bf4 009dd9c 2bc03b3 ee033d3 1bba217 098ee83 c599bf4 95a5062 1bba217 c599bf4 1bba217 0c734ac 1bba217 0c734ac 1bba217 c599bf4 1bba217 098ee83 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 |
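# This file groups the text lines recorded for each page into paragraphs and marks the first line of every paragraph on the processed page images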
import pandas as pd
import numpy as np
import fitz
import re
import cv2
# Patterns used to classify a single (stripped) line of text.  They are compiled
# once and written as raw strings.  The letter class is spelled [A-Za-z] because
# the range A-z also covers the punctuation between 'Z' and 'a' ([ \ ] ^ _ `).
_ENDS_SENTENCE_RE = re.compile(r'[:\.]([^A-Za-z]{0,2})$')  # Include colon?
_SECTION_HEADER_RE = re.compile(r'^([ABCDEIVX]+)$')
_ASTERISK_RE = re.compile(r'^([\s\*]+)$')
_DATE_RE = re.compile(r'(\[[A-Za-z\s0-9]*,\s[0-9]*]+)$')

# Substrings (compared case-insensitively) that mark a disposition line.
_DISPOSITION_WORDS = ('affirm', 'reverse', 'vacate', 'so ordered')

# Indentation thresholds, in the same units as the x-coordinates in data.csv.
_TAB_INDENT = 7
_SUPERTAB_INDENT = 14


def paragraphs(folderpath):
    """Group the lines listed in ``<folderpath>/data.csv`` into paragraphs.

    ``data.csv`` must provide a ``Pg Ind`` column and a ``Lines`` column.  Each
    ``Lines`` cell is the textual form of a sequence of
    ``(x1, y1, x2, y2, text)`` entries for that page.

    A new paragraph is started whenever the previous/current line pair looks
    like a section header, an indented first line after a finished sentence,
    the start or end of a block quote, or a disposition line.

    Args:
        folderpath: Directory that contains ``data.csv``.

    Returns:
        A ``pandas.DataFrame`` with a single ``Lines`` column.  Every row is a
        list of ``(page index, line index on that page, is_inblock, text)``
        tuples, one tuple per line of the paragraph.
    """
    df = pd.read_csv(folderpath + '/data.csv').replace({np.nan: None})

    # Flatten every page into parallel per-line lists, and remember the
    # left-most / right-most x-coordinate seen on each page.
    x1s, line_texts, line_inds, pg_inds = [], [], [], []
    baselines, rights = {}, {}
    for pg_ind, lines_repr in zip(df['Pg Ind'].tolist(), df['Lines'].tolist()):
        if not lines_repr:
            continue
        # SECURITY NOTE: eval() executes whatever is stored in data.csv, so the
        # file must come from a trusted source.  It is kept because the exact
        # serialisation written by the producer of data.csv is not visible here.
        lines = eval(lines_repr)
        if not lines:
            # A page without lines has no margins to measure; min()/max() of an
            # empty sequence would raise.
            continue
        for line_ind, line in enumerate(lines):
            x1s.append(line[0])
            line_texts.append(line[4])
            pg_inds.append(pg_ind)
            line_inds.append(line_ind)
        baselines[pg_ind] = min(line[0] for line in lines)
        rights[pg_ind] = max(line[2] for line in lines)

    paras = []
    para = []  # initialised up front so an input without lines still works
    is_inblock = False
    for j, line_text in enumerate(line_texts):
        # NOTE(review): the very first line is never added to a paragraph; this
        # mirrors the long-standing behaviour -- confirm that it is intended.
        # Lines with empty text are skipped as well.
        if j == 0 or not line_text:
            continue

        prior_text = line_texts[j - 1].strip()
        current_text = line_text.strip()
        prior_pg, current_pg = pg_inds[j - 1], pg_inds[j]
        prior_indent = x1s[j - 1] - baselines[prior_pg]
        current_indent = x1s[j] - baselines[current_pg]
        prior_median = (baselines[prior_pg] + rights[prior_pg]) / 2
        current_median = (baselines[current_pg] + rights[current_pg]) / 2

        prior_endswith_period = _ENDS_SENTENCE_RE.search(prior_text) is not None
        prior_is_date = _DATE_RE.search(prior_text) is not None
        is_section_header = (
            _SECTION_HEADER_RE.search(prior_text) is not None
            or _SECTION_HEADER_RE.search(current_text) is not None
            or _ASTERISK_RE.search(prior_text) is not None
            or _ASTERISK_RE.search(current_text) is not None
            or prior_is_date
        )

        current_tabbed = current_indent > _TAB_INDENT
        prior_supertabbed = prior_indent >= _SUPERTAB_INDENT
        current_supertabbed = current_indent >= _SUPERTAB_INDENT
        prior_more_left = current_indent - prior_indent > _TAB_INDENT
        prior_right_margin = x1s[j - 1] > prior_median
        # The current line is measured against the centre of its own page.
        current_right_margin = x1s[j] > current_median

        # Note: Supertabbing oofs stuff
        is_the_classic = (prior_endswith_period and current_tabbed
                          and prior_more_left and not prior_supertabbed)
        is_start_blockquote = (prior_endswith_period and current_supertabbed
                               and prior_more_left and not is_inblock)
        is_after_blockquote = (prior_endswith_period
                               and not current_supertabbed and is_inblock)
        is_after_disposition = prior_right_margin and current_tabbed
        lowered = line_text.lower()
        is_disposition = current_right_margin and any(
            word in lowered for word in _DISPOSITION_WORDS)

        if is_start_blockquote and not is_section_header:
            is_inblock = True
        # This date hack helps ensure that slip opinion headers do not get caught
        if is_after_blockquote or prior_is_date:
            is_inblock = False

        if (is_section_header or is_the_classic or is_start_blockquote
                or is_after_blockquote or is_after_disposition
                or is_disposition):
            paras.append(para)
            para = []
        para.append((pg_inds[j], line_inds[j], is_inblock, line_text))

    paras.append(para)
    return pd.DataFrame({'Lines': paras})
def process_file(folderpath):
    """Write the paragraph table and mark each paragraph start on its page image.

    Steps:
      1. Run ``paragraphs(folderpath)`` and save the result as
         ``<folderpath>/paragraphs.csv``.
      2. For the first line of every non-empty paragraph, look up that line's
         box ``(x1, y1, x2, y2, text)`` in ``<folderpath>/data.csv``.
      3. Draw a small dot 15 units to the left of the box, at its vertical
         centre, on ``<folderpath>/<page index>-processed.png`` and write the
         image back in place.

    Args:
        folderpath: Directory holding ``data.csv`` and the
            ``<page index>-processed.png`` images.

    Raises:
        OSError: If a page image that needs a marker cannot be read.
    """
    paras_df = paragraphs(folderpath)
    paras_df.to_csv(folderpath + '/paragraphs.csv', index=False)
    data_df = pd.read_csv(folderpath + '/data.csv')

    page_lines = {}  # page index -> parsed 'Lines' cell, parsed once per page
    markers = {}     # page index -> list of (x, y) dot centres
    for para_lines in paras_df['Lines'].tolist():
        if not para_lines:
            continue
        pg_ind, line_ind, _, _ = para_lines[0]
        if pg_ind not in page_lines:
            page_rows = data_df[data_df['Pg Ind'] == pg_ind]
            # SECURITY NOTE: eval() executes whatever is stored in data.csv, so
            # the file must come from a trusted source.
            page_lines[pg_ind] = eval(page_rows['Lines'].tolist()[0])
        x1, y1, _, y2, _ = page_lines[pg_ind][line_ind]
        # cv2.circle needs integer pixel coordinates, so both components are
        # truncated to int.
        centre = (int(x1 - 15), int(0.5 * (y1 + y2)))
        markers.setdefault(pg_ind, []).append(centre)

    # Each image is read and written once, however many paragraphs start on it.
    for pg_ind, centres in markers.items():
        image_path = folderpath + '/' + str(pg_ind) + '-processed.png'
        image = cv2.imread(image_path)
        if image is None:
            # cv2.imread reports failure by returning None instead of raising.
            raise OSError('Could not read page image: ' + image_path)
        for centre in centres:
            cv2.circle(image, centre, radius=1, color=(240, 32, 160), thickness=2)
        cv2.imwrite(image_path, image)