Upload 4 files
Browse files- ImageProcessor.py +148 -0
- Manager.py +10 -0
- Parser.py +287 -0
- Scraper.py +40 -0
ImageProcessor.py
ADDED
|
@@ -0,0 +1,148 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import cv2
|
| 2 |
+
import fitz
|
| 3 |
+
import numpy as np
|
| 4 |
+
import os
|
| 5 |
+
import pandas as pd
|
| 6 |
+
import pytesseract
|
| 7 |
+
import warnings
|
| 8 |
+
|
| 9 |
+
def pdf2png(folderpath):
    """Render every page of ``folderpath/opinion.pdf`` to a PNG.

    Pages are rasterized at zoom 1 and written into the same folder as
    ``0.png``, ``1.png``, ...

    Args:
        folderpath: directory containing ``opinion.pdf``.
    """
    doc = fitz.open(folderpath + '/opinion.pdf')
    try:
        zoom = 1  # identity scale; raise for higher-resolution rasters
        mat = fitz.Matrix(zoom, zoom)
        for (i, p) in enumerate(doc):
            pix = p.get_pixmap(matrix=mat)
            pix.save(folderpath + '/' + str(i) + '.png')
    finally:
        # Release the PDF file handle even if a page fails to render.
        doc.close()
|
| 16 |
+
|
| 17 |
+
def get_footnote_bbox(filename):
    """Locate the footnote block on a rendered page image.

    Looks for the short horizontal rule that precedes footnotes: a thin,
    wide contour hugging the page's left margin.

    Args:
        filename: path to a page PNG produced by ``pdf2png``.

    Returns:
        ``(x1, y1, x2, y2)`` spanning from the rule down to the
        bottom-right of the page body, or ``(None, None, None, None)``
        when no rule is detected.
    """
    footnotes_bbox = (None, None, None, None)
    x1p, y1p, x2p, y2p = get_page_bbox(filename)
    image = cv2.imread(filename)
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    # Isolate dark marks, then smear horizontally so a thin rule becomes
    # one wide contour.
    thresh = cv2.threshold(gray, 240, 255, cv2.THRESH_BINARY_INV)[1]
    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (20, 1))
    dilate = cv2.dilate(thresh, kernel, iterations=1)
    cnts = cv2.findContours(dilate, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    cnts = cnts[0] if len(cnts) == 2 else cnts[1]  # OpenCV 3 vs 4 return shape
    cnts = sorted(cnts, key=lambda c: cv2.boundingRect(c)[1])  # top-to-bottom
    for c in cnts:
        x, y, w, h = cv2.boundingRect(c)
        # Footnote rule: thin (h < 7), reasonably wide (w > 50), below the
        # page top, near the left margin.  No break: the last (lowest)
        # qualifying rule wins, matching where footnotes actually start.
        if h < 7 and w > 50 and y > y1p and x - x1p < 30:
            footnotes_bbox = (x, y, x2p, y2p)
    return footnotes_bbox
|
| 35 |
+
|
| 36 |
+
def get_header_bbox(filename):
    """Locate the running header at the top of a page image.

    The page is Otsu-thresholded and dilated with a very wide kernel so
    each text line merges into one blob; the topmost blob is taken as
    the header band.

    Args:
        filename: path to a page PNG.

    Returns:
        ``(x1, y1, x2, y2)``; the bottom edge is fixed 40px below the
        band's top.

    Raises:
        ValueError: if the page contains no detectable ink (the original
            code hit an undefined-name error in that case).
    """
    image = cv2.imread(filename)
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    blur = cv2.GaussianBlur(gray, (9, 9), 0)
    thresh = cv2.threshold(blur, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)[1]

    # Very wide kernel: merge a whole line of text into a single blob.
    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (200, 10))
    dilate = cv2.dilate(thresh, kernel, iterations=1)

    cnts = cv2.findContours(dilate, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    cnts = cnts[0] if len(cnts) == 2 else cnts[1]  # OpenCV 3 vs 4 return shape
    if not cnts:
        raise ValueError('no contours found in %s; cannot locate header' % filename)
    # Topmost contour = header band (replaces the old sort + immediate break).
    x, y, w, h = cv2.boundingRect(min(cnts, key=lambda c: cv2.boundingRect(c)[1]))
    header_bbox = (x, y, x + w, y + 40)
    # header_bbox = (145, 45, 465, 155) # For digitized variants
    return header_bbox
|
| 56 |
+
|
| 57 |
+
|
| 58 |
+
def get_page_bbox(filename):
    """Compute the bounding box of all printed content on a page image.

    Args:
        filename: path to a page PNG.

    Returns:
        ``(x1, y1, x2, y2)`` where the top edge is anchored to the
        header's top (margins above the header are excluded) and the
        other edges are the extremes over all detected text contours.

    Raises:
        ValueError: if the page contains no detectable ink.
    """
    image = cv2.imread(filename)
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    blur = cv2.GaussianBlur(gray, (7, 7), 0)
    thresh = cv2.threshold(blur, 240, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)[1]
    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (50, 10))
    dilate = cv2.dilate(thresh, kernel, iterations=1)
    cnts = cv2.findContours(dilate, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    cnts = cnts[0] if len(cnts) == 2 else cnts[1]  # OpenCV 3 vs 4 return shape
    if not cnts:
        raise ValueError('no contours found in %s; cannot compute page bbox' % filename)

    header_bbox = get_header_bbox(filename)
    # One boundingRect call per contour (the old code called it four
    # times per contour); min/max don't depend on contour order, so the
    # previous sort is unnecessary.
    rects = [cv2.boundingRect(c) for c in cnts]
    return (min(x for x, y, w, h in rects),
            header_bbox[1],
            max(x + w for x, y, w, h in rects),
            max(y + h for x, y, w, h in rects))
|
| 76 |
+
|
| 77 |
+
def get_case_separator(filename):
    """Find the horizontal rule separating two cases on one page.

    Scans contours top-to-bottom for a thin, wide mark sitting in the
    centered-left band of the page body, below both the page top and the
    header.

    Args:
        filename: path to a page PNG.

    Returns:
        ``(x1, y, x2, y)`` for the first such rule, or
        ``(None, None, None, None)`` when the page holds a single case.
    """
    new_case_line = (None, None, None, None)
    x1p, y1p, x2p, y2p = get_page_bbox(filename)
    x1h, y1h, x2h, y2h = get_header_bbox(filename)

    image = cv2.imread(filename)
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    blur = cv2.GaussianBlur(gray, (7, 7), 0)
    thresh = cv2.threshold(blur, 240, 255, cv2.THRESH_BINARY_INV)[1]
    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (20, 1))
    dilate = cv2.dilate(thresh, kernel, iterations=1)
    cnts = cv2.findContours(dilate, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    cnts = cnts[0] if len(cnts) == 2 else cnts[1]  # OpenCV 3 vs 4 return shape
    cnts = sorted(cnts, key=lambda c: cv2.boundingRect(c)[1])  # top-to-bottom
    x_center = (x1p + x2p) / 2  # loop-invariant: hoisted out of the scan
    for c in cnts:
        x, y, w, h = cv2.boundingRect(c)
        # Thin (h < 8), wide (w > 70) rule whose left offset falls in the
        # 30%-100% band left of page center, below page top and header.
        if h < 8 and w > 70 and (0.3 * x_center < (x - x1p) < x_center) and (y > y1p and y > y1h):
            new_case_line = (x1p, y, x2p, y)
            break
    return new_case_line
|
| 99 |
+
|
| 100 |
+
def get_page_elements(filename):
    """Detect all layout regions on a page image and annotate them.

    Args:
        filename: path to a page PNG.

    Returns:
        ``(page_bbox, header_bbox, fn_bbox, body_bbox,
        case_separator_bbox, image)`` where ``image`` is the page with
        each detected region outlined in a distinct color.
    """
    page_bbox = get_page_bbox(filename)
    header_bbox = get_header_bbox(filename)
    fn_bbox = get_footnote_bbox(filename)
    case_separator_bbox = get_case_separator(filename)

    # Body spans from below the header to the footnotes (when present)
    # or to the bottom of the page.
    body_bottom = fn_bbox[1] if fn_bbox[0] is not None else page_bbox[3]
    body_bbox = (page_bbox[0], header_bbox[3], page_bbox[2], body_bottom)

    annotated = cv2.imread(filename)
    cv2.rectangle(annotated, (page_bbox[0], page_bbox[1]), (page_bbox[2], page_bbox[3]), (0, 0, 0), 4)
    cv2.rectangle(annotated, (header_bbox[0], header_bbox[1]), (header_bbox[2], header_bbox[3]), (0, 255, 0), 2)
    cv2.rectangle(annotated, (body_bbox[0], body_bbox[1]), (body_bbox[2], body_bbox[3]), (255, 0, 0), 2)
    if fn_bbox[0] is not None:
        cv2.rectangle(annotated, (fn_bbox[0], fn_bbox[1]), (fn_bbox[2], fn_bbox[3]), (0, 0, 255), 2)
    if case_separator_bbox[0] is not None:
        cv2.rectangle(annotated, (case_separator_bbox[0], case_separator_bbox[1]),
                      (case_separator_bbox[2], case_separator_bbox[3]), (255, 0, 255), 2)

    return page_bbox, header_bbox, fn_bbox, body_bbox, case_separator_bbox, annotated
|
| 121 |
+
|
| 122 |
+
def process_file(folderpath):
    """Run layout detection over every page of ``folderpath/opinion.pdf``.

    Renders the PDF to PNGs, detects the header/body/footnote/page/case-
    separator regions on each page, writes an annotated
    ``<page>-processed.png`` per page, and saves all region coordinates
    to ``folderpath/data.csv``.

    Args:
        folderpath: directory containing ``opinion.pdf``.
    """
    pdf2png(folderpath)
    files = [f for f in os.listdir(folderpath)
             if '.png' in f.lower() and "processed" not in f.lower()]
    rows = []
    for f in files:
        page_bbox, header_bbox, fn_bbox, body_bbox, case_separator_bbox, image = \
            get_page_elements(folderpath + '/' + f)
        ind = int(f.split('.png')[0])
        rows.append({
            'Pg Ind': ind,
            'Header X1': header_bbox[0], 'Header Y1': header_bbox[1],
            'Header X2': header_bbox[2], 'Header Y2': header_bbox[3],
            'Body X1': body_bbox[0], 'Body Y1': body_bbox[1],
            'Body X2': body_bbox[2], 'Body Y2': body_bbox[3],
            'Footer X1': fn_bbox[0], 'Footer Y1': fn_bbox[1],
            'Footer X2': fn_bbox[2], 'Footer Y2': fn_bbox[3],
            'Page X1': page_bbox[0], 'Page Y1': page_bbox[1],
            'Page X2': page_bbox[2], 'Page Y2': page_bbox[3],
            'Case Separator Y': case_separator_bbox[1],
        })
        cv2.imwrite(folderpath + '/' + str(ind) + '-processed.png', image)
    # Build the DataFrame once: the old per-row pd.concat inside the loop
    # is quadratic in the number of pages.
    columns = ['Pg Ind',
               'Header X1', 'Header Y1', 'Header X2', 'Header Y2',
               'Body X1', 'Body Y1', 'Body X2', 'Body Y2',
               'Footer X1', 'Footer Y1', 'Footer X2', 'Footer Y2',
               'Page X1', 'Page Y1', 'Page X2', 'Page Y2',
               'Case Separator Y']
    data_df = pd.DataFrame(rows, columns=columns)
    data_df['Pg Ind'] = data_df['Pg Ind'].astype('int')
    data_df.to_csv(folderpath + '/data.csv', index=False)
|
| 148 |
+
|
Manager.py
ADDED
|
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import Parser
|
| 2 |
+
import ImageProcessor
|
| 3 |
+
import pickle
|
| 4 |
+
|
| 5 |
+
def run(foldername):
    """Process one case folder end-to-end and pickle the parsed Case.

    Runs layout detection, parses the opinions, and serializes the
    resulting Case to ``PDF Cases/<foldername>/processed.pkl``.
    """
    case_dir = 'PDF Cases/' + foldername
    ImageProcessor.process_file(case_dir)
    parsed_case = Parser.run(case_dir)
    with open(case_dir + '/processed.pkl', 'wb') as outp:
        pickle.dump(parsed_case, outp, pickle.HIGHEST_PROTOCOL)
|
| 10 |
+
|
Parser.py
ADDED
|
@@ -0,0 +1,287 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import fitz
|
| 2 |
+
import numpy as np
|
| 3 |
+
import os
|
| 4 |
+
import pandas as pd
|
| 5 |
+
import re
|
| 6 |
+
import datetime
|
| 7 |
+
import pytesseract
|
| 8 |
+
import cv2
|
| 9 |
+
import warnings
|
| 10 |
+
import ocrmypdf
|
| 11 |
+
import spacy
|
| 12 |
+
import dateparser
|
| 13 |
+
import statistics
|
| 14 |
+
from statistics import mode
|
| 15 |
+
from textblob import Word
|
| 16 |
+
from Levenshtein import distance
|
| 17 |
+
|
| 18 |
+
nlp = spacy.load('en_core_web_trf')
|
| 19 |
+
|
| 20 |
+
def parse_doc(folderpath):
    """Extract header/body/footer text for every page of the opinion PDF.

    The preliminary-print cover page is skipped entirely.

    Returns:
        Three parallel lists (headers, bodies, footers) plus a dict
        mapping page index -> ``(header, body, footer)``.
    """
    doc = fitz.open(folderpath + '/opinion.pdf')
    header_texts = []
    body_texts = []
    footer_texts = []
    paginated_dict = {}
    for page_index in range(len(doc)):
        ht, bt, ft = parse_page(folderpath, page_index)
        if "preliminary print" in ht.lower():  # Skip cover page
            continue
        header_texts.append(ht)
        body_texts.append(bt)
        footer_texts.append(ft)
        paginated_dict[page_index] = (ht, bt, ft)
    return header_texts, body_texts, footer_texts, paginated_dict
|
| 33 |
+
|
| 34 |
+
|
| 35 |
+
def parse_page(folderpath, pg_ind):
    """Extract header, body, and footer text for one page.

    Reads the region coordinates produced by ImageProcessor from
    ``folderpath/data.csv`` and clips PDF text extraction to each
    region.

    Args:
        folderpath: case directory holding ``opinion.pdf`` and ``data.csv``.
        pg_ind: zero-based page index.

    Returns:
        ``(header_text, body_text, footer_text)``; ``footer_text`` is
        ``None`` when the page has no footnote block.
    """
    df = pd.read_csv(folderpath + '/data.csv')
    page_df = df[df['Pg Ind'] == pg_ind].to_dict('records')[0]

    header_rect = fitz.Rect(page_df['Header X1'], page_df['Header Y1'],
                            page_df['Header X2'], page_df['Header Y2'])
    body_rect = fitz.Rect(page_df['Body X1'], page_df['Body Y1'],
                          page_df['Body X2'], page_df['Body Y2'])

    doc = fitz.open(folderpath + '/opinion.pdf')
    page = doc[pg_ind]

    def _clip_text(rect):
        # Strip the watermark stamped on preliminary prints.
        return page.get_text("text", clip=rect).strip().replace('Page Proof Pending Publication', '')

    header_text = _clip_text(header_rect)
    body_text = _clip_text(body_rect)
    footer_text = None
    # Footer coordinates are NaN when no footnote block was detected;
    # test before building the Rect instead of string-comparing "nan".
    if pd.notna(page_df['Footer X1']):
        footer_rect = fitz.Rect(page_df['Footer X1'], page_df['Footer Y1'],
                                page_df['Footer X2'], page_df['Footer Y2'])
        footer_text = _clip_text(footer_rect)
    return header_text, body_text, footer_text
|
| 55 |
+
|
| 56 |
+
|
| 57 |
+
def get_splits(folderpath):
    """Split the full opinion text into per-opinion chunks.

    Joins all page body text, repairs OCR misspellings of "justice",
    then splits at each opinion heading ("Per Curiam.", "Justice X
    delivered the opinion", "Justice X, concurring/dissenting").

    Returns:
        ``(splits, paginated_dict)`` where ``splits[0]`` is any text
        before the first heading and ``paginated_dict`` comes from
        ``parse_doc``.
    """
    header_texts, body_texts, footer_texts, paginated_dict = parse_doc(folderpath)
    full_body_text = "\n".join(body_texts).replace('-', '')
    full_body_text = correct(full_body_text, "justice")

    # NOTE: the original pattern used [A-z], which in ASCII also matches
    # the punctuation between 'Z' and 'a' ([ \ ] ^ _ `); [A-Za-z] is the
    # intended class.
    split_p = re.compile(
        r'((\n|^)\s*Per Curiam\.\s*\n)|((\n|^)\s*(Mr\.\s*)?Justice[A-Za-z\s\n,]*delivered the opinion)|((\n|^)\s*(mr\.\s*)?justice[A-Za-z\n\s,–-]*(concurring|dissenting)[A-Za-z\n\s,–]*\.)',
        re.IGNORECASE)
    splits_m = list(re.finditer(split_p, full_body_text))
    if splits_m:
        print("---Found split---")

    # Chunk boundaries: start of text plus the start of each heading;
    # each chunk runs to the next heading (or end of text).
    starts = [0] + [m.span()[0] for m in splits_m]
    ends = [m.span()[0] for m in splits_m] + [len(full_body_text)]
    splits = [full_body_text[s:e].strip() for s, e in zip(starts, ends)]
    return splits, paginated_dict
|
| 82 |
+
|
| 83 |
+
|
| 84 |
+
def get_split_data(split):
    """Classify one opinion chunk from its opening sentence.

    Runs NER over the first sentence to extract justice names, then keys
    off signal phrases ("delivered", "per curiam", "concurring",
    "dissenting").  Later phrases win when several appear.

    Args:
        split: one chunk produced by ``get_splits``.

    Returns:
        ``(opinion_type, author, joining)``; ``opinion_type`` falls back
        to "pre" (prefatory material such as the syllabus) when no
        phrase matches.
    """
    txt = split[0:300]
    d = nlp(txt)
    first_sent = list(d.sents)[0]
    first_sent_text = " ".join([t.text for t in first_sent])
    ents = nlp(first_sent_text).ents
    # Normalize "JUSTICE SO-AND-SO" -> "So-and-so" (strip the title).
    person_ents = [e.text.lower().split('tice')[-1].strip().capitalize()
                   for e in ents if e.label_ == "PERSON"]
    if "the chief justice" in first_sent_text:
        person_ents.append("Chief")
    # Guard: NER may find no names; report None rather than crashing on
    # person_ents[0].
    first_person = person_ents[0] if person_ents else None
    opinion_type, author, joining = None, None, []
    if "delivered" in first_sent_text:
        author = first_person
        joining = []
        opinion_type = "majority"
    if "per curiam" in first_sent_text.lower():
        author = "Per Curiam"
        joining = []
        opinion_type = "majority"
    if "concurring" in first_sent_text:
        author = first_person
        joining = person_ents[1:]
        opinion_type = "concurrence"
    if "dissenting" in first_sent_text:
        author = first_person
        joining = person_ents[1:]
        opinion_type = "dissent"
    if opinion_type is None:
        opinion_type = "pre"
    return opinion_type, author, joining
|
| 113 |
+
|
| 114 |
+
def court_from_year(date_time):
    """Return the court composition at a given date.

    Reads 'Justices Table.csv' (columns Name/Start/End, ISO dates) and
    collects the justices whose tenure spans ``date_time``.

    Returns:
        ``{'Associate': [<last names>], 'Chief': <name> or None}``.
    """
    justice_dict = pd.read_csv('Justices Table.csv').to_dict('records')
    court_year = {'Associate': [], 'Chief': None}
    for j in justice_dict:
        start = datetime.datetime.strptime(j['Start'], '%Y-%m-%d')
        # Sitting justices have no End date (NaN in the CSV): treat as now.
        if str(j['End']) == "nan":
            end = datetime.datetime.now()
        else:
            end = datetime.datetime.strptime(j['End'], '%Y-%m-%d')
        if not (start < date_time < end):
            continue
        name = j['Name']
        if "Associate" in name:
            court_year['Associate'].append(
                name.split('(Associate Justice)')[0].split(', ')[0].strip().split(' ')[-1])
        if "Chief" in name:
            court_year['Chief'] = name.split('(Chief Justice)')[0].split(', ')[0].strip()
    return court_year
|
| 131 |
+
|
| 132 |
+
def correct(corpus, keyword):
    """Spell-correct near-misses of ``keyword`` in ``corpus``.

    Finds space-separated tokens within Levenshtein distance 1 of
    ``keyword`` (OCR artifacts such as 'justlce') and replaces them with
    textblob's top suggestion when its confidence exceeds 0.9.

    Args:
        corpus: full text to repair.
        keyword: lowercase target word (e.g. "justice").

    Returns:
        The corrected text, whitespace structure preserved.
    """
    words = corpus.split(' ')
    potential_targets = []
    for (i, w) in enumerate(words):
        d = distance(keyword, w.lower())
        if 0 < d < 2:
            potential_targets.append((i, w))

    for (ind, pt) in potential_targets:
        word = Word(pt.lower())
        result = word.spellcheck()
        if result[0][1] > 0.9 and result[0][0].lower() != pt.lower():
            corrected = result[0][0]
            core = pt.strip()
            # Replace only the token core so any attached newlines stay in
            # place.  The old code emitted "\n" + correction whenever the
            # token contained a newline, which moved the newline to the
            # front and discarded the remaining characters of the token.
            words[ind] = pt.replace(core, corrected, 1) if core else corrected
    return " ".join(words)
|
| 149 |
+
|
| 150 |
+
class Opinion:
    """One opinion within a case: who wrote it, who joined, and its text."""

    def __init__(self, opinion_type, author, joining, body_text, fn_text, header_text):
        # Text regions extracted from the PDF.
        self.body_text = body_text
        self.fn_text = fn_text
        self.header_text = header_text
        # Classification: "majority" | "concurrence" | "dissent" | "pre".
        self.opinion_type = opinion_type
        # Authoring justice ("Chief", a last name, or "Per Curiam") and
        # the justices joining the opinion.
        self.author = author
        self.joining = joining
|
| 158 |
+
|
| 159 |
+
class Case:
    """A parsed Supreme Court case: its opinions plus extracted metadata
    (decision date, case name, U.S. citation, reporter page numbers,
    recusals, and cert/appeal info).

    Populated in two phases: ``Parser.run`` assigns the opinions, then
    ``process`` derives the metadata from their text.
    """

    def __init__(self, paginated_dict):
        # Page index -> (header_text, body_text, footer_text).
        self.paginated_dict = paginated_dict
        # Opinions, filled in by Parser.run.
        self.majority, self.concurrences, self.dissents, self.pre = None, [], [], None
        # Metadata, filled in by process().
        self.date, self.case_name, self.case_citation, self.page_numbers = None, "", None, []
        self.recused = []
        self.cert_info = None

    def get_date(self):
        """Set self.date from the 'Decided <Month> <day>, <year>' line
        found in the prefatory (syllabus) text."""
        print("Extracting Date")
        # Only the opening of the prefatory text is scanned.
        doc = nlp(self.pre.body_text[0:2000])
        sents = list(doc.sents)
        for s in sents:
            if "Decided" in s.text:
                # NOTE(review): this first date_extract is immediately
                # overwritten below; the regex match is the value parsed.
                date_extract = s.text.replace('\n', '').split('Decided')[-1].strip().replace('.', '')
                pattern = re.compile('Decided\s*\w*\s*[0-9]{1,2}, [0-9]{4}')
                match = re.search(pattern, s.text)
                date_extract = s.text[match.span()[0]:match.span()[1]].split('Decided')[-1].strip()
                date = datetime.datetime.strptime(date_extract, '%B %d, %Y')
                self.date = date
                return

    def update_recused(self):
        """Populate self.recused from a 'Justice X ... took no part'
        sentence in the majority opinion, if one exists."""
        print("Identifying recused")
        p = re.compile('(?:justice )[\w\s]*(?: took no part)', re.IGNORECASE)
        m = re.search(p, self.majority.body_text)
        if m is not None:
            recused_span = self.majority.body_text[m.span()[0]:m.span()[1]].lower()
            doc = nlp(recused_span)
            # Strip the 'justice' title and normalize capitalization.
            self.recused = [e.text.split('justice')[-1].upper().strip().capitalize() for e in doc.ents if
                            e.label_ == "PERSON"]
            if "chief justice" in recused_span:
                self.recused.append("Chief")

    def update_majority_joining(self):
        """Infer the majority's joiners: every justice sitting at
        self.date who is not already an author, joiner, or recusal."""
        print("Getting updated list")
        cy = court_from_year(self.date)
        # Every justice with a known role in this case.
        known = [j for d in self.dissents for j in d.joining] + [d.author for d in self.dissents] + [j for c in self.concurrences for j in c.joining] + [c.author for c in self.concurrences] + [self.majority.author] + [r for r in self.recused]
        all_justices = [aj for aj in cy['Associate']]
        if cy['Chief'] is not None:
            all_justices.append('Chief')
        self.majority.joining = [aj for aj in all_justices if aj not in known]

    def get_cert_info(self):
        """Extract the certiorari/appeal description: the prefatory lines
        between the 'on petition/certiorari/appeals/...' line and the
        docket-number/'Argued'/'Decided' line."""
        print("Extracting Cert Info")
        lines = self.pre.body_text.split('\n')
        start = -1
        end = -1
        for (i, l) in enumerate(lines):
            # Last matching line wins for start; first match ends the scan.
            if "petition" in l.lower() or "cert" in l.lower() or "appeals" in l.lower() or "on" in l.lower().split(' '):
                start = i
            if "no." in l.lower() or "no.s" in l.lower() or "argued" in l.lower() or "decided" in l.lower():
                end = i
                break
        # NOTE(review): .replace(' ', ' ') appears to be a no-op — possibly
        # a double-space or non-breaking-space collapse lost in transit;
        # confirm against the original intent.
        self.cert_info = " ".join(lines[start:end]).strip().upper().replace(' ', ' ').replace('.', '')

    def get_case_name_cite_pns(self):
        """Derive the case name, U.S. citation, and per-page reporter page
        numbers from the running headers."""
        # All header lines except each page's last line.
        lines_total = [l for p in self.paginated_dict for l in self.paginated_dict[p][0].split('\n')[:-1]]
        lines_selected = []
        # Exclude boilerplate header lines; the most frequent remaining
        # line is taken to be the case name.
        p = re.compile('(october|per curiam|opinion of|concur|dissent|statement of|argument|syllabus|[0-9] ?U.)', re.IGNORECASE)
        for l in lines_total:
            m = re.search(p, l)
            if m is None and not l.lower().strip().isnumeric():
                lines_selected.append(l)
        self.case_name = mode(lines_selected)

        # First 'NNN U. S. NNN'-styled string found becomes the citation.
        p = re.compile('[0-9]*\s?U\.\s?S\. ?([0-9]|_)*', re.IGNORECASE)
        lines_selected = []
        for l in lines_total:
            m = re.search(p, l)
            if m is not None:
                self.case_citation = l[m.span()[0]:m.span()[1]]
                break

        # Reporter page numbers: largest standalone number in each page's
        # header; when a page prints none, continue from the previous page
        # (or start at 1).  Note: the comprehension variable below shadows
        # the regex `p` only inside the comprehension.
        p = re.compile('^\s?[0-9]+\s?$', re.IGNORECASE)
        page_lines = [self.paginated_dict[p][0].split('\n') for p in self.paginated_dict]
        self.page_numbers = []
        for pl in page_lines:
            numeric_on_page = []
            for l in pl:
                matches = list(re.finditer(p, l))
                for m in matches:
                    possibility = int(l[m.span()[0]:m.span()[1]].strip())
                    numeric_on_page.append(possibility)
            if len(numeric_on_page) == 0:
                if len(self.page_numbers) > 0:
                    self.page_numbers.append(self.page_numbers[-1] + 1)
                else:
                    self.page_numbers.append(1)
            if len(numeric_on_page) > 0:
                page_number = max(numeric_on_page)
                if len(self.page_numbers) > 0:
                    page_number = max(page_number, self.page_numbers[-1] + 1)
                self.page_numbers.append(page_number)

        # Slip citations like '.. U. S. ___' have no page yet: append the
        # first reporter page number.
        if self.case_citation is not None and self.case_citation.lower().split('s.')[-1].strip() == "":
            self.case_citation = self.case_citation.strip() + ' ' + str(self.page_numbers[0])

    def process(self):
        """Run all metadata extraction steps.  Order matters: date and
        recusals feed update_majority_joining."""
        self.get_date()
        self.update_recused()
        self.update_majority_joining()
        self.get_cert_info()
        self.get_case_name_cite_pns()
|
| 267 |
+
|
| 268 |
+
def run(folderpath):
    """Parse one case folder into a fully-populated Case object.

    Splits the opinion text into chunks, classifies each chunk into an
    Opinion, slots the opinions into a Case, and runs the Case's
    metadata extraction.
    """
    splits, paginated_dict = get_splits(folderpath)
    case = Case(paginated_dict=paginated_dict)
    ops = []
    for chunk in splits:
        opinion_type, author, joining = get_split_data(chunk)
        if opinion_type is None:
            continue
        op = Opinion(opinion_type, author, joining, chunk, fn_text=None, header_text=None)
        if opinion_type == "majority":
            case.majority = op
        elif opinion_type == "concurrence":
            case.concurrences.append(op)
        elif opinion_type == "dissent":
            case.dissents.append(op)
        elif opinion_type == "pre":
            case.pre = op
        ops.append(op)

    case.process()
    return case
|
Scraper.py
ADDED
|
@@ -0,0 +1,40 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from bs4 import BeautifulSoup
|
| 2 |
+
import re
|
| 3 |
+
import requests
|
| 4 |
+
import os
|
| 5 |
+
|
| 6 |
+
def download_slip(link):
    """Download a slip opinion PDF from supremecourt.gov.

    Saves it as ``PDF Cases/<name>/opinion.pdf`` where ``<name>`` is the
    link's filename without extension.

    Args:
        link: site-relative PDF path (e.g. ``/opinions/...pdf``).

    Raises:
        requests.HTTPError: if the server returns an error status.
    """
    base = link.split('/')[-1].split('.pdf')[0]
    if not os.path.isdir('PDF Cases/' + base):
        os.mkdir('PDF Cases/' + base)
    name = 'PDF Cases/' + base + '/' + "opinion.pdf"
    # Context manager closes the streamed connection deterministically;
    # raise_for_status avoids silently saving an HTML error page as PDF.
    with requests.get("https://www.supremecourt.gov" + link, stream=True) as r:
        r.raise_for_status()
        with open(name, 'wb') as f:
            for chunk in r.iter_content(chunk_size=1024):
                f.write(chunk)
|
| 15 |
+
|
| 16 |
+
def download_loc(link):
    """Download a Library of Congress U.S. Reports PDF.

    The filename encodes the citation as ``usrepVVVPPP...``; the file is
    saved under ``PDF Cases/<volume>_<page>/opinion.pdf``.

    Args:
        link: absolute URL to the PDF.

    Raises:
        requests.HTTPError: if the server returns an error status.
    """
    base = link.split('/')[-1].split('.pdf')[0]
    # e.g. 'usrep410113' -> volume 410, page 113.
    volume = int(base.split('usrep')[-1][0:3])
    page = int(base.split('usrep')[-1][3:])
    foldername = str(volume) + '_' + str(page)
    if not os.path.isdir('PDF Cases/' + foldername):
        os.mkdir('PDF Cases/' + foldername)
    name = 'PDF Cases/' + foldername + '/' + "opinion.pdf"
    # Context manager closes the streamed connection deterministically;
    # raise_for_status avoids silently saving an HTML error page as PDF.
    with requests.get(link, stream=True) as r:
        r.raise_for_status()
        with open(name, 'wb') as f:
            for chunk in r.iter_content(chunk_size=1024):
                f.write(chunk)
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
def slip_pipeline(year):
    """Download every slip opinion listed for a term year.

    Scrapes the slip-opinion index page and downloads each linked PDF,
    skipping 'new' badges and revision-diff links.

    Args:
        year: term year as used in the site URL (e.g. 21).

    Raises:
        requests.HTTPError: if the index page request fails.
    """
    page = requests.get("https://www.supremecourt.gov/opinions/slipopinion/" + str(year))
    page.raise_for_status()
    # Explicit parser avoids bs4's GuessedAtParserWarning and makes the
    # parse reproducible across environments.
    soup = BeautifulSoup(page.text, 'html.parser')
    html_links = soup.findAll('div', attrs={'id': 'accordion'})[0].findAll('a')
    links = []
    for link in html_links:
        href = link.get('href')
        if ".pdf" in href.lower() and "new" not in href and "diff" not in href:
            links.append(href)

    for l in links:
        download_slip(l)
|