Delete Parser.py
Browse files
Parser.py
DELETED
|
@@ -1,312 +0,0 @@
|
|
| 1 |
-
import fitz
|
| 2 |
-
import numpy as np
|
| 3 |
-
import os
|
| 4 |
-
import pandas as pd
|
| 5 |
-
import re
|
| 6 |
-
import datetime
|
| 7 |
-
import pytesseract
|
| 8 |
-
import cv2
|
| 9 |
-
import warnings
|
| 10 |
-
import ocrmypdf
|
| 11 |
-
import spacy
|
| 12 |
-
import dateparser
|
| 13 |
-
import statistics
|
| 14 |
-
from statistics import mode
|
| 15 |
-
from textblob import Word
|
| 16 |
-
from Levenshtein import distance
|
| 17 |
-
|
| 18 |
-
# Transformer-based spaCy pipeline; supplies sentence segmentation and PERSON
# entity recognition for the parsing helpers below.
nlp = spacy.load('en_core_web_trf')
|
| 19 |
-
|
| 20 |
-
def parse_doc(folderpath):
    """Parse every page of <folderpath>/opinion.pdf into header/body/footer text.

    Returns three parallel lists (headers, bodies, footers) plus a dict mapping
    each page index to its (header, body, footer) triple. Any page whose header
    contains "preliminary print" (the cover page) is skipped entirely.
    """
    document = fitz.open(folderpath + '/opinion.pdf')
    headers = []
    bodies = []
    footers = []
    by_page = {}
    for page_index in range(len(document)):
        header, body, footer = parse_page(folderpath, page_index)
        # The preliminary-print cover page carries no opinion text.
        if "preliminary print" in header.lower():
            continue
        headers.append(header)
        bodies.append(body)
        footers.append(footer)
        by_page[page_index] = (header, body, footer)
    return headers, bodies, footers, by_page
|
| 33 |
-
|
| 34 |
-
|
| 35 |
-
def parse_page(folderpath, pg_ind):
    """Extract header, body and footer text for one page of the opinion PDF.

    Region coordinates come from <folderpath>/data.csv, one row per page
    (matched on the 'Pg Ind' column). Returns (header_text, body_text,
    footer_text); footer_text is None when the page has no footer box
    (its 'Footer X1' cell is NaN in the CSV).
    """
    df = pd.read_csv(folderpath + '/data.csv')

    page_row = df[df['Pg Ind'] == pg_ind].to_dict('records')[0]
    header = [page_row['Header X1'], page_row['Header Y1'], page_row['Header X2'], page_row['Header Y2']]
    body = [page_row['Body X1'], page_row['Body Y1'], page_row['Body X2'], page_row['Body Y2']]
    footer = [page_row['Footer X1'], page_row['Footer Y1'], page_row['Footer X2'], page_row['Footer Y2']]

    header_rect = fitz.Rect(header[0], header[1], header[2], header[3])
    body_rect = fitz.Rect(body[0], body[1], body[2], body[3])

    doc = fitz.open(folderpath + '/opinion.pdf')
    page = doc[pg_ind]
    # The proof watermark is layout noise, not opinion text.
    header_text = page.get_text("text", clip=header_rect).strip().replace('Page Proof Pending Publication', '')
    body_text = page.get_text("text", clip=body_rect).strip().replace('Page Proof Pending Publication', '')
    footer_text = None
    # Pages without a footer region leave the footer columns empty (NaN).
    # Test the raw CSV value directly instead of the fragile
    # str(fitz.Rect(...)[0]) != "nan" round-trip the original used.
    if not pd.isna(footer[0]):
        footer_rect = fitz.Rect(footer[0], footer[1], footer[2], footer[3])
        footer_text = page.get_text("text", clip=footer_rect).strip().replace('Page Proof Pending Publication', '')
    return header_text, body_text, footer_text
|
| 55 |
-
|
| 56 |
-
|
| 57 |
-
def get_splits(folderpath):
    """Split the full opinion body text into per-opinion chunks.

    Joins all page bodies, OCR-corrects near-misses of "justice", then splits
    on opinion headings ("Per Curiam.", "Justice X delivered the opinion",
    "Justice X, concurring/dissenting."). Returns (splits, paginated_dict);
    splits[0] is whatever precedes the first heading (front matter).
    """
    header_texts, body_texts, footer_texts, paginated_dict = parse_doc(folderpath)
    full_body_text = "\n".join(body_texts).replace('-', '')
    full_body_text = correct(full_body_text, "justice")

    # FIX: the original pattern used [A-z], which also matches the characters
    # [ \ ] ^ _ ` lying between 'Z' and 'a'; [A-Za-z] is the intended letter
    # class (case folding is already handled by re.IGNORECASE).
    split_p = re.compile(
        r'((\n|^)\s*Per Curiam\.\s*\n)'
        r'|(Justice[A-Za-z\s\n,]*delivered the opinion)'
        r'|((\n|^)\s*(mr\.\s*)?justice[A-Za-z\n\s,–-]*(concurring|dissenting)[A-Za-z\n\s,–]*\.)',
        re.IGNORECASE)
    # ((\n|^)\s*(Mr\.\s*(chief)?\s*)?Justice[A-Za-z\s\n,]*delivered the opinion)
    splits_m = list(re.finditer(split_p, full_body_text))
    splits = []

    if len(splits_m) > 0:
        print("---Found split---")
        # Slice from each heading start to the next heading start; the first
        # slice is the front matter, the last runs to the end of the text.
        i = 0
        while i <= len(splits_m):
            start = 0 if i == 0 else splits_m[i - 1].span()[0]
            if i == len(splits_m):
                splits.append(full_body_text[start:].strip())
            else:
                splits.append(full_body_text[start:splits_m[i].span()[0]].strip())
            i = i + 1
    return splits, paginated_dict
|
| 81 |
-
|
| 82 |
-
|
| 83 |
-
def get_split_data(split):
    """Classify one opinion chunk, returning (opinion_type, author, joining).

    Only the first sentence of the chunk's first 300 characters is examined.
    opinion_type is "majority", "concurrence", "dissent", or "pre" (front
    matter before the first opinion). author is a justice surname, "Chief",
    "Per Curiam", or None when NER finds no name; joining lists the remaining
    justices named in the heading.
    """
    txt = split[0:300]
    d = nlp(txt)
    first_sent = list(d.sents)[0]
    first_sent_text = " ".join([t.text for t in first_sent])
    ents = nlp(first_sent_text).ents
    # "Justice Foo" / "JUSTICE FOO" -> "Foo"
    person_ents = [e.text.lower().split('tice')[-1].strip().capitalize() for e in ents if e.label_ == "PERSON"]
    if "chief justice" in first_sent_text.lower():
        person_ents.append("Chief")
    # FIX: NER can miss every name; fall back to None for the author instead
    # of raising IndexError on person_ents[0].
    first_person = person_ents[0] if person_ents else None
    opinion_type, author, joining = None, None, []
    # Later matches deliberately override earlier ones (e.g. a heading that is
    # both "delivered" and "dissenting" classifies as a dissent).
    if "delivered" in first_sent_text:
        author = first_person
        joining = []
        opinion_type = "majority"
    if "per curiam" in first_sent_text.lower():
        author = "Per Curiam"
        joining = []
        opinion_type = "majority"
    if "concurring" in first_sent_text:
        author = first_person
        joining = person_ents[1:]
        opinion_type = "concurrence"
    if "dissenting" in first_sent_text:
        author = first_person
        joining = person_ents[1:]
        opinion_type = "dissent"
    # FIX: identity-style `== None` replaced with the idiomatic `is None`.
    if opinion_type is None:
        opinion_type = "pre"
    return opinion_type, author, joining
|
| 112 |
-
|
| 113 |
-
def court_from_year(date_time):
    """Return the composition of the court at *date_time*.

    Reads 'Justices Table.csv' (Name / Start / End columns) and returns
    {'Associate': [surnames...], 'Chief': full name or None} for the justices
    whose tenure strictly contains the given datetime.
    """
    roster = pd.read_csv('Justices Table.csv').to_dict('records')
    court_year = {'Associate': [], 'Chief': None}
    for row in roster:
        start = datetime.datetime.strptime(row['Start'], '%Y-%m-%d')
        # A missing End cell (NaN) means the justice is still serving.
        if str(row['End']) != "nan":
            end = datetime.datetime.strptime(row['End'], '%Y-%m-%d')
        else:
            end = datetime.datetime.now()
        if not (start < date_time < end):
            continue
        name = row['Name']
        if "Associate" in name:
            court_year['Associate'].append(name.split('(Associate Justice)')[0].split(', ')[0].strip().split(' ')[-1])
        if "Chief" in name:
            court_year['Chief'] = name.split('(Chief Justice)')[0].split(', ')[0].strip()
    return court_year
|
| 130 |
-
|
| 131 |
-
def correct(corpus, keyword):
    """OCR-correct tokens that are one edit away from *keyword*.

    Splits *corpus* on single spaces, finds tokens at Levenshtein distance
    exactly 1 from *keyword* (case-insensitively), and replaces each with
    TextBlob's top spelling suggestion when the suggestion is confident
    (probability > 0.9) and actually different. A newline glued to the token
    is preserved as a leading newline. Returns the rejoined text.
    """
    tokens = corpus.split(' ')
    candidates = [(idx, tok) for idx, tok in enumerate(tokens)
                  if 0 < distance(keyword, tok.lower()) < 2]

    for idx, tok in candidates:
        suggestion, confidence = Word(tok.lower()).spellcheck()[0]
        if confidence > 0.9 and suggestion.lower() != tok.lower():
            # Keep the token attached to its line break, if it had one.
            tokens[idx] = ("\n" + suggestion) if "\n" in tok else suggestion
    return " ".join(tokens)
|
| 148 |
-
|
| 149 |
-
def closest_justice(name, datetime):
    """Snap an extracted *name* to the sitting justice it most plausibly names.

    If *name* (capitalized) is already on the court as of the given datetime,
    it is returned unchanged. Otherwise the sitting justice with the smallest
    Levenshtein distance is returned, with the chief justice normalized to
    the sentinel surname "Chief".

    NOTE(review): the second parameter shadows the `datetime` module inside
    this function; callers pass a datetime instance positionally.
    """
    court = court_from_year(datetime)
    bench = court['Associate']
    if court['Chief'] is not None:
        bench += [court['Chief']]
    if name.capitalize() in bench:
        return name
    scores = [distance(sitting, name) for sitting in bench]
    best = bench[np.argmin(scores)]
    if best.capitalize() == court['Chief']:
        best = "Chief"
    return best
|
| 162 |
-
|
| 163 |
-
class Opinion:
    """One opinion within a case: who wrote it, who joined it, and its text."""

    def __init__(self, opinion_type, author, joining, body_text, fn_text, header_text):
        # opinion_type: "majority" | "concurrence" | "dissent" | "pre";
        # author: justice surname, "Chief", or "Per Curiam";
        # joining: surnames of the justices joining this opinion.
        self.opinion_type, self.author, self.joining = opinion_type, author, joining
        # fn_text and header_text are optional and may be None.
        self.body_text, self.fn_text, self.header_text = body_text, fn_text, header_text
|
| 171 |
-
|
| 172 |
-
class Case:
    """A single Supreme Court case assembled from its parsed opinion chunks."""

    def __init__(self, paginated_dict):
        # paginated_dict: page index -> (header_text, body_text, footer_text)
        self.paginated_dict = paginated_dict
        self.majority, self.concurrences, self.dissents, self.pre = None, [], [], None
        self.date, self.case_name, self.case_citation, self.page_numbers = None, "", None, []
        self.recused = []
        self.cert_info = None

    def get_date(self):
        """Parse the decision date from the front matter's 'Decided ...' line."""
        print("Extracting Date")
        if self.pre is None:
            # FIX: the original printed the pages and then dereferenced
            # self.pre anyway, raising AttributeError; bail out instead.
            print(self.paginated_dict)
            return
        doc = nlp(self.pre.body_text[0:2000])
        for s in doc.sents:
            if "Decided" not in s.text:
                continue
            # Loose fallback: everything after "Decided", de-newlined.
            date_extract = s.text.replace('\n', '').split('Decided')[-1].strip().replace('.', '')
            pattern = re.compile(r'Decided\s*\w*\s*[0-9]{1,2}[\.,]\s?[0-9]{4}')
            match = re.search(pattern, s.text)
            if match is not None:
                # FIX: guard the match before .span() — prefer the tight
                # "Decided <Month> <d>, <yyyy>" span, else use the fallback
                # above (previously dead code followed by a crash).
                date_extract = s.text[match.span()[0]:match.span()[1]].split('Decided')[-1].strip()
            self.date = dateparser.parse(date_extract)
            return

    def update_recused(self):
        """Record justices who 'took no part', from the majority opinion text."""
        print("Identifying recused")
        p = re.compile(r'(?:justice )[\w\s]*(?: took no part)', re.IGNORECASE)
        m = re.search(p, self.majority.body_text)
        if m is not None:
            recused_span = self.majority.body_text[m.span()[0]:m.span()[1]].lower()
            doc = nlp(recused_span)
            self.recused = [e.text.split('justice')[-1].upper().strip().capitalize()
                            for e in doc.ents if e.label_ == "PERSON"]
            if "chief justice" in recused_span:
                self.recused.append("Chief")

    def update_majority_joining(self):
        """Infer majority joiners: everyone on the court not otherwise accounted for."""
        print("Getting updated list")
        cy = court_from_year(self.date)
        # Justices already attributed: dissent/concurrence authors and their
        # joiners, the majority author, and anyone recused.
        known = ([j for d in self.dissents for j in d.joining]
                 + [d.author for d in self.dissents]
                 + [j for c in self.concurrences for j in c.joining]
                 + [c.author for c in self.concurrences]
                 + [self.majority.author]
                 + [r for r in self.recused])
        all_justices = [aj for aj in cy['Associate']]
        if cy['Chief'] is not None:
            all_justices.append('Chief')
        self.majority.joining = [aj for aj in all_justices if aj not in known]

    def get_cert_info(self):
        """Extract the certiorari/appeal recital line from the front matter."""
        print("Extracting Cert Info")
        lines = self.pre.body_text.split('\n')
        start = -1
        end = -1
        for (i, l) in enumerate(lines):
            low = l.lower()
            if "petition" in low or "cert" in low or "error" in low or "appeal" in low or "on" in low.split(' '):
                start = i
            if "no." in low or "nos." in low or "argued" in low or "decided" in low:
                end = i
                break
        # NOTE(review): replace(' ', ' ') is a no-op as written — it likely
        # meant to collapse doubled or non-breaking spaces; confirm against
        # the original source before changing it.
        self.cert_info = " ".join(lines[start:end]).strip().upper().replace(' ', ' ').replace('.', '')

    def get_case_name_cite_pns(self):
        """Derive the case name, U.S. citation, and per-page page numbers.

        The case name is the most common header line that is neither a section
        marker nor a bare number; the citation is the first "NN U. S. NN"-style
        header match; page numbers come from numeric header lines, forced to be
        strictly increasing across pages.
        """
        lines_total = [l for p in self.paginated_dict for l in self.paginated_dict[p][0].split('\n')[:-1]]
        lines_selected = []
        p = re.compile(r'(october|per curiam|opinion of|concur|dissent|statement of|argument|syllabus|[0-9] ?U.)', re.IGNORECASE)
        for l in lines_total:
            m = re.search(p, l)
            if m is None and not l.lower().strip().isnumeric():
                lines_selected.append(l)
        self.case_name = mode(lines_selected)

        p = re.compile(r'[0-9]*\s?U\.\s?S\. ?([0-9]|_)*', re.IGNORECASE)
        for l in lines_total:
            m = re.search(p, l)
            if m is not None:
                self.case_citation = l[m.span()[0]:m.span()[1]]
                break

        p = re.compile(r'^\s?[0-9]+\s?$', re.IGNORECASE)
        page_lines = [self.paginated_dict[pg][0].split('\n') for pg in self.paginated_dict]
        self.page_numbers = []
        for pl in page_lines:
            numeric_on_page = []
            for l in pl:
                for m in re.finditer(p, l):
                    numeric_on_page.append(int(l[m.span()[0]:m.span()[1]].strip()))
            if len(numeric_on_page) == 0:
                # No printed number: continue the running count (or start at 1).
                if len(self.page_numbers) > 0:
                    self.page_numbers.append(self.page_numbers[-1] + 1)
                else:
                    self.page_numbers.append(1)
            else:
                page_number = max(numeric_on_page)
                if len(self.page_numbers) > 0:
                    page_number = max(page_number, self.page_numbers[-1] + 1)
                self.page_numbers.append(page_number)

        # "NN U. S. ___"-style slips with no page number: append the first one.
        if self.case_citation is not None and self.case_citation.lower().split('s.')[-1].strip() == "":
            self.case_citation = self.case_citation.strip() + ' ' + str(self.page_numbers[0])

    def update_justice_names(self):
        """Snap NER-extracted author names to actual sitting justices."""
        if self.majority.author.lower() != "per curiam":
            self.majority.author = closest_justice(self.majority.author, self.date)
        for (i, cons) in enumerate(self.concurrences):
            self.concurrences[i].author = closest_justice(self.concurrences[i].author, self.date)
        for (i, dis) in enumerate(self.dissents):
            self.dissents[i].author = closest_justice(self.dissents[i].author, self.date)
        return

    def process(self):
        """Run the full post-parse enrichment pipeline in dependency order."""
        self.get_date()
        self.update_justice_names()
        self.update_recused()
        self.update_majority_joining()
        self.get_cert_info()
        self.get_case_name_cite_pns()
|
| 292 |
-
|
| 293 |
-
def run(folderpath):
    """End-to-end pipeline: parse the PDF, split opinions, build a Case.

    Returns the populated Case after its enrichment pipeline has run.
    """
    splits, paginated_dict = get_splits(folderpath)
    case = Case(paginated_dict=paginated_dict)
    opinions = []
    for chunk in splits:
        opinion_type, author, joining = get_split_data(chunk)
        if opinion_type is None:
            continue
        opinion = Opinion(opinion_type, author, joining, chunk, fn_text=None, header_text=None)
        # File each opinion on the case according to its classification.
        if opinion_type == "majority":
            case.majority = opinion
        elif opinion_type == "concurrence":
            case.concurrences.append(opinion)
        elif opinion_type == "dissent":
            case.dissents.append(opinion)
        elif opinion_type == "pre":
            case.pre = opinion
        opinions.append(opinion)

    case.process()
    return case
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|