cools committed on
Commit
b8fa3fe
·
1 Parent(s): 2427ffa

Delete Parser.py

Browse files
Files changed (1) hide show
  1. Parser.py +0 -312
Parser.py DELETED
@@ -1,312 +0,0 @@
1
- import fitz
2
- import numpy as np
3
- import os
4
- import pandas as pd
5
- import re
6
- import datetime
7
- import pytesseract
8
- import cv2
9
- import warnings
10
- import ocrmypdf
11
- import spacy
12
- import dateparser
13
- import statistics
14
- from statistics import mode
15
- from textblob import Word
16
- from Levenshtein import distance
17
-
18
- nlp = spacy.load('en_core_web_trf')
19
-
20
def parse_doc(folderpath):
    """Parse every page of ``folderpath``/opinion.pdf into region texts.

    Returns four values: lists of header, body, and footer strings (cover
    pages excluded), plus a dict mapping page index to that page's
    (header, body, footer) triple.
    """
    doc = fitz.open(folderpath + '/opinion.pdf')
    header_texts = []
    body_texts = []
    footer_texts = []
    paginated_dict = {}
    for page_index in range(len(doc)):
        header, body, footer = parse_page(folderpath, page_index)
        # The "preliminary print" cover page carries no opinion text.
        if "preliminary print" in header.lower():
            continue
        body_texts.append(body)
        header_texts.append(header)
        footer_texts.append(footer)
        paginated_dict[page_index] = (header, body, footer)
    return header_texts, body_texts, footer_texts, paginated_dict
33
-
34
-
35
def parse_page(folderpath, pg_ind):
    """Extract header/body/footer text for page ``pg_ind`` of the opinion PDF.

    Crop rectangles come from ``folderpath``/data.csv.  The footer text is
    ``None`` when the CSV holds no footer rectangle (NaN coordinates) for
    this page.
    """
    df = pd.read_csv(folderpath + '/data.csv')
    row = df[df['Pg Ind'] == pg_ind].to_dict('records')[0]

    header_rect = fitz.Rect(row['Header X1'], row['Header Y1'], row['Header X2'], row['Header Y2'])
    body_rect = fitz.Rect(row['Body X1'], row['Body Y1'], row['Body X2'], row['Body Y2'])
    footer_rect = fitz.Rect(row['Footer X1'], row['Footer Y1'], row['Footer X2'], row['Footer Y2'])
    case_split = row['Case Separator Y']  # read (as in the CSV schema) but unused here

    doc = fitz.open(folderpath + '/opinion.pdf')
    page = doc[pg_ind]

    def region_text(rect):
        # Every extracted region has the draft watermark stripped.
        return page.get_text("text", clip=rect).strip().replace('Page Proof Pending Publication', '')

    header_text = region_text(header_rect)
    body_text = region_text(body_rect)
    footer_text = None
    # A NaN footer X1 marks "no footer on this page".
    if str(footer_rect[0]) != "nan":
        footer_text = region_text(footer_rect)
    return header_text, body_text, footer_text
55
-
56
-
57
def get_splits(folderpath):
    """Split the full opinion body text into per-opinion chunks.

    Chunks are delimited by "Per Curiam" / "delivered the opinion" /
    concurring / dissenting headings.  Returns the list of chunks (empty
    when no heading matched) together with the per-page text dict from
    parse_doc().
    """
    header_texts, body_texts, footer_texts, paginated_dict = parse_doc(folderpath)
    full_body_text = "\n".join(body_texts).replace('-', '')
    # OCR often mangles "justice"; repair near-misses before matching.
    full_body_text = correct(full_body_text, "justice")

    split_p = re.compile('((\n|^)\s*Per Curiam\.\s*\n)|(Justice[A-z\s\n,]*delivered the opinion)|((\n|^)\s*(mr\.\s*)?justice[A-Za-z\n\s,–-]*(concurring|dissenting)[A-Za-z\n\s,–]*\.)', re.IGNORECASE)
    # ((\n|^)\s*(Mr\.\s*(chief)?\s*)?Justice[A-z\s\n,]*delivered the opinion)
    splits_m = list(re.finditer(split_p, full_body_text))

    splits = []
    if splits_m:
        print("---Found split---")
        # Slice boundaries: document start, each heading start, document end.
        bounds = [0] + [m.span()[0] for m in splits_m] + [len(full_body_text)]
        splits = [full_body_text[a:b].strip() for a, b in zip(bounds, bounds[1:])]
    return splits, paginated_dict
81
-
82
-
83
def get_split_data(split):
    """Classify one opinion chunk by its first sentence.

    Returns ``(opinion_type, author, joining)`` where ``opinion_type`` is
    one of "majority", "concurrence", "dissent", or "pre" (front matter),
    ``author`` is the writing justice's surname ("Chief" / "Per Curiam"
    included, or None when no PERSON entity was found), and ``joining``
    lists justices who joined the opinion.
    """
    txt = split[0:300]
    d = nlp(txt)
    first_sent = list(d.sents)[0]
    first_sent_text = " ".join([t.text for t in first_sent])
    lowered = first_sent_text.lower()
    ents = nlp(first_sent_text).ents
    # Strip the "Justice"/"JUSTICE" prefix spaCy tends to include in spans.
    person_ents = [e.text.lower().split('tice')[-1].strip().capitalize() for e in ents if e.label_ == "PERSON"]
    if "chief justice" in lowered:
        person_ents.append("Chief")

    # Guard against sentences where spaCy found no PERSON entity at all.
    lead = person_ents[0] if person_ents else None

    opinion_type, author, joining = None, None, []
    # Keyword checks are case-insensitive to agree with the IGNORECASE
    # heading regex that produced the splits (all-caps headings included).
    if "delivered" in lowered:
        author = lead
        joining = []
        opinion_type = "majority"
    if "per curiam" in lowered:
        author = "Per Curiam"
        joining = []
        opinion_type = "majority"
    if "concurring" in lowered:
        author = lead
        joining = person_ents[1:]
        opinion_type = "concurrence"
    if "dissenting" in lowered:
        author = lead
        joining = person_ents[1:]
        opinion_type = "dissent"
    if opinion_type is None:
        opinion_type = "pre"
    return opinion_type, author, joining
112
-
113
def court_from_year(date_time):
    """Return the court composition in effect at ``date_time``.

    Reads 'Justices Table.csv' and returns a dict: associate justices'
    surnames under 'Associate', the chief justice's name under 'Chief'
    (None when no chief matched the date range).
    """
    records = pd.read_csv('Justices Table.csv').to_dict('records')
    court_year = {'Associate': [], 'Chief': None}
    for rec in records:
        start = datetime.datetime.strptime(rec['Start'], '%Y-%m-%d')
        # A NaN end date means the justice is still serving.
        if str(rec['End']) != "nan":
            end = datetime.datetime.strptime(rec['End'], '%Y-%m-%d')
        else:
            end = datetime.datetime.now()
        if start < date_time < end:
            name = rec['Name']
            if "Associate" in name:
                surname = name.split('(Associate Justice)')[0].split(', ')[0].strip().split(' ')[-1]
                court_year['Associate'].append(surname)
            if "Chief" in name:
                court_year['Chief'] = name.split('(Chief Justice)')[0].split(', ')[0].strip()
    return court_year
130
-
131
def correct(corpus, keyword):
    """Spell-correct words one edit away from ``keyword`` in ``corpus``.

    Tokens within Levenshtein distance 1 of ``keyword`` are run through
    TextBlob's spellchecker and replaced only when the checker is confident
    (> 0.9) and actually proposes a different word.  A newline glued to the
    token by OCR is preserved.  Returns the corrected corpus.
    """
    words = corpus.split(' ')
    candidates = [
        (i, w)
        for i, w in enumerate(words)
        if 0 < distance(keyword, w.lower()) < 2
    ]
    for ind, token in candidates:
        suggestion, confidence = Word(token.lower()).spellcheck()[0]
        if confidence > 0.9 and suggestion.lower() != token.lower():
            # Keep the newline OCR glued onto the token.
            words[ind] = ("\n" + suggestion) if "\n" in token else suggestion
    return " ".join(words)
148
-
149
def closest_justice(name, datetime):
    """Map a possibly-misspelled justice surname to a sitting justice.

    ``datetime`` is the decision date used to look up the court's
    composition (NOTE: the parameter name shadows the stdlib module inside
    this function; kept for caller compatibility).  Returns the closest
    sitting justice's surname by Levenshtein distance, with the chief
    justice normalized to "Chief"; if ``name`` already matches a sitting
    justice it is returned unchanged.
    """
    cy = court_from_year(datetime)
    # Copy: the original `justices += [...]` mutated cy['Associate'] in
    # place because `justices` aliased that list.
    justices = list(cy['Associate'])
    if cy['Chief'] is not None:
        justices.append(cy['Chief'])
    if name.capitalize() in justices:
        return name
    scores = [distance(j, name) for j in justices]
    closest_name = justices[int(np.argmin(scores))]
    if closest_name.capitalize() == cy['Chief']:
        closest_name = "Chief"
    return closest_name
162
-
163
class Opinion:
    """A single opinion: majority, concurrence, dissent, or front matter.

    Attributes mirror the constructor arguments: ``opinion_type`` is one of
    "majority" / "concurrence" / "dissent" / "pre"; ``author`` is the
    writing justice's surname (or "Chief" / "Per Curiam"); ``joining``
    lists joining justices; the ``*_text`` fields hold the raw text.
    """

    def __init__(self, opinion_type, author, joining, body_text, fn_text, header_text):
        self.opinion_type, self.author, self.joining = opinion_type, author, joining
        self.body_text, self.fn_text, self.header_text = body_text, fn_text, header_text
171
-
172
class Case:
    """One Supreme Court case: its opinions plus extracted metadata.

    ``paginated_dict`` maps a page index to the (header, body, footer)
    text triple produced by parse_doc().  ``process()`` fills in the
    decision date, normalized justice names, recusals, majority joiners,
    cert info, case name, citation, and printed page numbers.
    """

    def __init__(self, paginated_dict):
        self.paginated_dict = paginated_dict
        # The majority Opinion, concurrence/dissent Opinion lists, and the
        # front-matter ("pre") Opinion that precedes the majority.
        self.majority, self.concurrences, self.dissents, self.pre = None, [], [], None
        self.date, self.case_name, self.case_citation, self.page_numbers = None, "", None, []
        # Surnames of justices who took no part in the decision.
        self.recused = []
        self.cert_info = None

    def get_date(self):
        """Parse the 'Decided <Month> <day>, <year>' date from the front
        matter into ``self.date`` via dateparser."""
        print("Extracting Date")
        if self.pre is None:
            # NOTE(review): debug dump only — the attribute access just
            # below still raises AttributeError when self.pre is None.
            print(self.paginated_dict)
        doc = nlp(self.pre.body_text[0:2000])
        sents = list(doc.sents)
        for s in sents:
            if "Decided" in s.text:
                date_extract = s.text.replace('\n', '').split('Decided')[-1].strip().replace('.', '')
                pattern = re.compile('Decided\s*\w*\s*[0-9]{1,2}[\.,]\s?[0-9]{4}')
                match = re.search(pattern, s.text)
                # The regex-bounded extract supersedes the cruder split above.
                date_extract = s.text[match.span()[0]:match.span()[1]].split('Decided')[-1].strip()
                date = dateparser.parse(date_extract)
                self.date = date
                return

    def update_recused(self):
        """Populate ``self.recused`` from a 'Justice ... took no part'
        sentence in the majority opinion, if one exists."""
        print("Identifying recused")
        p = re.compile('(?:justice )[\w\s]*(?: took no part)', re.IGNORECASE)
        m = re.search(p, self.majority.body_text)
        if m is not None:
            recused_span = self.majority.body_text[m.span()[0]:m.span()[1]].lower()
            doc = nlp(recused_span)
            # Strip any leading 'justice' from each PERSON entity's text.
            self.recused = [e.text.split('justice')[-1].upper().strip().capitalize() for e in doc.ents if
                            e.label_ == "PERSON"]
            if "chief justice" in recused_span:
                self.recused.append("Chief")

    def update_majority_joining(self):
        """Set the majority's joiners to every sitting justice not already
        accounted for as an author, joiner, or recusal."""
        print("Getting updated list")
        cy = court_from_year(self.date)
        # Everyone already attached to a dissent or concurrence, plus the
        # majority author and the recused justices.
        known = [j for d in self.dissents for j in d.joining] + [d.author for d in self.dissents] + [
            j for c in self.concurrences for j in c.joining] + [
            c.author for c in self.concurrences] + [self.majority.author] + [r for r in self.recused]
        all_justices = [aj for aj in cy['Associate']]
        if cy['Chief'] is not None:
            all_justices.append('Chief')
        self.majority.joining = [aj for aj in all_justices if aj not in known]

    def get_cert_info(self):
        """Extract the certiorari/appeal description from the front matter
        into ``self.cert_info`` (upper-cased, periods removed)."""
        print("Extracting Cert Info")
        lines = self.pre.body_text.split('\n')
        start = -1
        end = -1
        for (i, l) in enumerate(lines):
            # Last line that looks like the start of the cert description...
            if "petition" in l.lower() or "cert" in l.lower() or "error" in l.lower() or "appeal" in l.lower() or "on" in l.lower().split(' '):
                start = i
            # ...up to the first docket-number / Argued / Decided line.
            if "no." in l.lower() or "nos." in l.lower() or "argued" in l.lower() or "decided" in l.lower():
                end = i
                break
        # NOTE(review): replace(' ', ' ') is a no-op as written — possibly a
        # double-space collapse was lost in transit; confirm upstream.
        self.cert_info = " ".join(lines[start:end]).strip().upper().replace(' ', ' ').replace('.', '')

    def get_case_name_cite_pns(self):
        """Derive the case name (modal header line), the U.S. citation, and
        the printed page number for every page, from page headers."""
        # All header lines except each page's last line.
        lines_total = [l for p in self.paginated_dict for l in self.paginated_dict[p][0].split('\n')[:-1]]
        lines_selected = []
        p = re.compile('(october|per curiam|opinion of|concur|dissent|statement of|argument|syllabus|[0-9] ?U.)', re.IGNORECASE)
        for l in lines_total:
            m = re.search(p, l)
            if m is None and not l.lower().strip().isnumeric():
                lines_selected.append(l)
        # The most frequent remaining header line is taken as the case name.
        self.case_name = mode(lines_selected)

        p = re.compile('[0-9]*\s?U\.\s?S\. ?([0-9]|_)*', re.IGNORECASE)
        lines_selected = []
        for l in lines_total:
            m = re.search(p, l)
            if m is not None:
                self.case_citation = l[m.span()[0]:m.span()[1]]
                break

        # Printed page numbers: largest standalone number in each page's
        # header, with the sequence forced to be strictly increasing.
        p = re.compile('^\s?[0-9]+\s?$', re.IGNORECASE)
        page_lines = [self.paginated_dict[p][0].split('\n') for p in self.paginated_dict]
        self.page_numbers = []
        for pl in page_lines:
            numeric_on_page = []
            for l in pl:
                matches = list(re.finditer(p, l))
                for m in matches:
                    possibility = int(l[m.span()[0]:m.span()[1]].strip())
                    numeric_on_page.append(possibility)
            if len(numeric_on_page) == 0:
                if len(self.page_numbers) > 0:
                    self.page_numbers.append(self.page_numbers[-1] + 1)
                else:
                    self.page_numbers.append(1)
            if len(numeric_on_page) > 0:
                page_number = max(numeric_on_page)
                if len(self.page_numbers) > 0:
                    page_number = max(page_number, self.page_numbers[-1] + 1)
                self.page_numbers.append(page_number)

        # A citation ending at 'U. S.' (no page yet, e.g. slip opinions)
        # gets the first printed page number appended.
        if self.case_citation is not None and self.case_citation.lower().split('s.')[-1].strip() == "":
            self.case_citation = self.case_citation.strip() + ' ' + str(self.page_numbers[0])

    def update_justice_names(self):
        """Snap OCR'd author names onto actual sitting justices for the
        majority, every concurrence, and every dissent."""
        if self.majority.author.lower() != "per curiam":
            self.majority.author = closest_justice(self.majority.author, self.date)
        for (i,cons) in enumerate(self.concurrences):
            self.concurrences[i].author = closest_justice(self.concurrences[i].author, self.date)
        for (i,dissents) in enumerate(self.dissents):
            self.dissents[i].author = closest_justice(self.dissents[i].author, self.date)
        return

    def process(self):
        """Run the full metadata-extraction pipeline in dependency order
        (date first — later steps need it)."""
        self.get_date()
        self.update_justice_names()
        self.update_recused()
        self.update_majority_joining()
        self.get_cert_info()
        self.get_case_name_cite_pns()
292
-
293
def run(folderpath):
    """End-to-end pipeline: split the opinion PDF in ``folderpath`` into
    opinions, classify each chunk, and return the populated Case."""
    splits, paginated_dict = get_splits(folderpath)
    case = Case(paginated_dict=paginated_dict)
    ops = []
    for chunk in splits:
        opinion_type, author, joining = get_split_data(chunk)
        if opinion_type is None:
            continue
        op = Opinion(opinion_type, author, joining, chunk, fn_text=None, header_text=None)
        # Attach the opinion to its slot on the case.
        if opinion_type == "majority":
            case.majority = op
        elif opinion_type == "concurrence":
            case.concurrences.append(op)
        elif opinion_type == "dissent":
            case.dissents.append(op)
        elif opinion_type == "pre":
            case.pre = op
        ops.append(op)

    case.process()
    return case