cools commited on
Commit
30420b9
·
1 Parent(s): 96dd787

Upload 4 files

Browse files
Files changed (4) hide show
  1. ImageProcessor.py +148 -0
  2. Manager.py +10 -0
  3. Parser.py +287 -0
  4. Scraper.py +40 -0
ImageProcessor.py ADDED
@@ -0,0 +1,148 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import cv2
2
+ import fitz
3
+ import numpy as np
4
+ import os
5
+ import pandas as pd
6
+ import pytesseract
7
+ import warnings
8
+
9
def pdf2png(folderpath, zoom=1):
    """Render every page of ``folderpath/opinion.pdf`` to a PNG image.

    Pages are written as ``<folderpath>/<page_index>.png`` so the image
    pipeline and the PDF text extraction share the same page indices.

    Args:
        folderpath: Directory containing ``opinion.pdf``; PNGs are written
            into the same directory.
        zoom: Render scale factor (1 == native 72 dpi). Exposed as a
            parameter (default preserves the old hard-coded value) so
            callers can rasterize at higher resolution.
    """
    doc = fitz.open(folderpath + '/opinion.pdf')
    mat = fitz.Matrix(zoom, zoom)
    for i, page in enumerate(doc):
        pix = page.get_pixmap(matrix=mat)
        pix.save(folderpath + '/' + str(i) + '.png')
16
+
17
def get_footnote_bbox(filename):
    """Locate the footnote region of a scanned opinion page image.

    Searches for a thin, wide horizontal blob (the footnote separator
    rule) near the left margin, below the top of the page content.

    Returns:
        (x1, y1, x2, y2) spanning from the separator down to the page
        bottom, or (None, None, None, None) when no separator is found.
    """
    footnotes_bbox = (None, None, None, None)
    x1p, y1p, x2p, y2p = get_page_bbox(filename)
    x1h, y1h, x2h, y2h = get_header_bbox(filename)
    image = cv2.imread(filename)
    im_h, im_w, im_d = image.shape
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    # Hard threshold (not Otsu): near-white background goes black,
    # ink goes white, ready for contour detection.
    thresh = cv2.threshold(gray, 240, 255, cv2.THRESH_BINARY_INV)[1]
    # Wide 20x1 kernel merges glyphs on the same text row into one blob.
    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (20, 1))
    dilate = cv2.dilate(thresh, kernel, iterations=1)
    cnts = cv2.findContours(dilate, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    # findContours returns 2 values on OpenCV 4.x, 3 values on 3.x.
    cnts = cnts[0] if len(cnts) == 2 else cnts[1]
    cnts = sorted(cnts, key=lambda x: cv2.boundingRect(x)[1])  # top-to-bottom
    for (i, c) in enumerate(cnts):
        x, y, w, h = cv2.boundingRect(c)
        # Thin (h < 7), wide (w > 50) blob hugging the left margin is taken
        # as the footnote rule. No break: the lowest matching blob wins.
        if h < 7 and w > 50 and y > y1p and x - x1p < 30:
            footnotes_bbox = (x, y, x2p, y2p)
    return footnotes_bbox
35
+
36
def get_header_bbox(filename):
    """Bounding box of the running header at the top of a page image.

    Heavily dilates the thresholded page so each block of content becomes
    one blob, then takes the top-most blob as the header line and fixes
    its height to 40 px.

    Returns:
        (x1, y1, x2, y2) of the detected header region.

    Raises:
        ValueError: when the page contains no detectable content
            (previously this path crashed with a NameError because
            x/y/w/h were never assigned).
    """
    image = cv2.imread(filename)
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    blur = cv2.GaussianBlur(gray, (9, 9), 0)
    # Otsu picks the binarization threshold automatically.
    thresh = cv2.threshold(blur, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)[1]

    # Very wide kernel: fuse an entire text line (and beyond) into one blob.
    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (200, 10))
    dilate = cv2.dilate(thresh, kernel, iterations=1)

    cnts = cv2.findContours(dilate, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    # findContours returns 2 values on OpenCV 4.x, 3 values on 3.x.
    cnts = cnts[0] if len(cnts) == 2 else cnts[1]
    cnts = sorted(cnts, key=lambda c: cv2.boundingRect(c)[1])  # top-to-bottom
    if not cnts:
        raise ValueError("No content contours found in " + filename)
    # Top-most blob is the header; only its left/top/width matter.
    x, y, w, h = cv2.boundingRect(cnts[0])
    header_bbox = (x, y, x + w, y + 40)
    # header_bbox = (145, 45, 465, 155) # For digitized variants
    return header_bbox
56
+
57
+
58
def get_page_bbox(filename):
    """Bounding box of all printed content on a page image.

    The left/right/bottom edges come from the extremes of all content
    blobs; the top edge is anchored to the top of the header box so stray
    marks above the header are ignored.

    Returns:
        (x1, y1, x2, y2) of the content area.

    Raises:
        ValueError: when the page contains no detectable content
            (previously min()/max() raised on empty sequences).
    """
    image = cv2.imread(filename)
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    blur = cv2.GaussianBlur(gray, (7, 7), 0)
    thresh = cv2.threshold(blur, 240, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)[1]
    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (50, 10))
    dilate = cv2.dilate(thresh, kernel, iterations=1)
    cnts = cv2.findContours(dilate, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    # findContours returns 2 values on OpenCV 4.x, 3 values on 3.x.
    cnts = cnts[0] if len(cnts) == 2 else cnts[1]

    header_bbox = get_header_bbox(filename)
    # Compute each bounding rect once instead of four times per contour.
    rects = [cv2.boundingRect(c) for c in cnts]
    if not rects:
        raise ValueError("No content contours found in " + filename)
    x1 = min(x for x, y, w, h in rects)
    x2 = max(x + w for x, y, w, h in rects)
    y2 = max(y + h for x, y, w, h in rects)
    return x1, header_bbox[1], x2, y2
76
+
77
def get_case_separator(filename):
    """Find the horizontal rule separating two cases on the same page.

    Bound-volume scans can start a new case mid-page, marked by a short
    centered horizontal rule.

    Returns:
        (x1, y, x2, y): the rule's y position extended to the full page
        width, or (None, None, None, None) when no rule is found.
    """
    new_case_line = (None, None, None, None)
    x1p, y1p, x2p, y2p = get_page_bbox(filename)
    x1h, y1h, x2h, y2h = get_header_bbox(filename)

    image = cv2.imread(filename)
    im_h, im_w, im_d = image.shape
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    blur = cv2.GaussianBlur(gray, (7, 7), 0)
    thresh = cv2.threshold(blur, 240, 255, cv2.THRESH_BINARY_INV)[1]
    # Wide 20x1 kernel merges each printed row into a single blob.
    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (20, 1))
    dilate = cv2.dilate(thresh, kernel, iterations=1)
    cnts = cv2.findContours(dilate, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    # findContours returns 2 values on OpenCV 4.x, 3 values on 3.x.
    cnts = cnts[0] if len(cnts) == 2 else cnts[1]
    cnts = sorted(cnts, key=lambda x: cv2.boundingRect(x)[1])  # top-to-bottom
    for (i, c) in enumerate(cnts):
        x, y, w, h = cv2.boundingRect(c)
        x_center = (x1p + x2p) / 2
        # Thin (h < 8), moderately wide (w > 70) blob whose left offset sits
        # between 30% and 100% of the page center, below the page top and
        # the header top — i.e. a centered rule, not a line of text.
        if h < 8 and w > 70 and ((x - x1p) < x_center and (x - x1p) > 0.3 * x_center) and (y > y1p and y > y1h):  #
            new_case_line = (x1p, y, x2p, y)
            break
    return new_case_line
99
+
100
def get_page_elements(filename):
    """Compute all layout boxes for one page image and draw them on a copy.

    The body region runs from the bottom of the header to the top of the
    footnotes (or to the page bottom when no footnote rule was detected).

    Returns:
        (page_bbox, header_bbox, fn_bbox, body_bbox, case_separator_bbox,
        annotated_image) where annotated_image has the boxes drawn in BGR:
        black = page, green = header, blue = body, red = footnotes,
        magenta = case separator.
    """
    page_bbox = get_page_bbox(filename)
    header_bbox = get_header_bbox(filename)
    fn_bbox = get_footnote_bbox(filename)
    case_separator_bbox = get_case_separator(filename)
    # fn_bbox[0] is None when no footnote separator was found.
    if fn_bbox[0] is not None:
        body_bbox = (page_bbox[0], header_bbox[3], page_bbox[2], fn_bbox[1])
    else:
        body_bbox = (page_bbox[0], header_bbox[3], page_bbox[2], page_bbox[3])

    # Re-read the image and annotate it for manual inspection.
    image = cv2.imread(filename)
    cv2.rectangle(image, (page_bbox[0], page_bbox[1]), (page_bbox[2], page_bbox[3]), (0, 0, 0), 4)
    cv2.rectangle(image, (header_bbox[0], header_bbox[1]), (header_bbox[2], header_bbox[3]), (0, 255, 0), 2)
    cv2.rectangle(image, (body_bbox[0], body_bbox[1]), (body_bbox[2], body_bbox[3]), (255, 0, 0), 2)
    if fn_bbox[0] is not None:
        cv2.rectangle(image, (fn_bbox[0], fn_bbox[1]), (fn_bbox[2], fn_bbox[3]), (0, 0, 255), 2)
    if case_separator_bbox[0] is not None:
        cv2.rectangle(image, (case_separator_bbox[0], case_separator_bbox[1]),
                      (case_separator_bbox[2], case_separator_bbox[3]), (255, 0, 255), 2)

    return page_bbox, header_bbox, fn_bbox, body_bbox, case_separator_bbox, image
121
+
122
def process_file(folderpath):
    """Rasterize ``folderpath/opinion.pdf`` and record per-page layout data.

    For every rendered page: detects header/body/footnote/page bounding
    boxes plus the case-separator line, writes an annotated
    ``<n>-processed.png``, and saves one CSV row per page to
    ``folderpath/data.csv`` for Parser to consume.
    """
    pdf2png(folderpath)
    # endswith() is stricter than the old substring test, which also matched
    # names like "x.png.bak" and then crashed in int() below.
    files = [f for f in os.listdir(folderpath)
             if f.lower().endswith('.png') and 'processed' not in f.lower()]
    columns = ['Pg Ind',
               'Header X1', 'Header Y1', 'Header X2', 'Header Y2',
               'Body X1', 'Body Y1', 'Body X2', 'Body Y2',
               'Footer X1', 'Footer Y1', 'Footer X2', 'Footer Y2',
               'Page X1', 'Page Y1', 'Page X2', 'Page Y2',
               'Case Separator Y']
    rows = []
    for f in files:
        page_bbox, header_bbox, fn_bbox, body_bbox, case_separator_bbox, image = \
            get_page_elements(folderpath + '/' + f)
        ind = int(f.split('.png')[0])  # pages were saved as "<index>.png"
        rows.append({
            'Pg Ind': ind,
            'Header X1': header_bbox[0], 'Header Y1': header_bbox[1],
            'Header X2': header_bbox[2], 'Header Y2': header_bbox[3],
            'Body X1': body_bbox[0], 'Body Y1': body_bbox[1],
            'Body X2': body_bbox[2], 'Body Y2': body_bbox[3],
            'Footer X1': fn_bbox[0], 'Footer Y1': fn_bbox[1],
            'Footer X2': fn_bbox[2], 'Footer Y2': fn_bbox[3],
            'Page X1': page_bbox[0], 'Page Y1': page_bbox[1],
            'Page X2': page_bbox[2], 'Page Y2': page_bbox[3],
            'Case Separator Y': case_separator_bbox[1],
        })
        cv2.imwrite(folderpath + '/' + str(ind) + '-processed.png', image)
    # Build the DataFrame once: pd.concat inside the loop was O(n^2).
    data_df = pd.DataFrame(rows, columns=columns)
    data_df['Pg Ind'] = data_df['Pg Ind'].astype('int')
    data_df.to_csv(folderpath + '/data.csv', index=False)
148
+
Manager.py ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ import Parser
2
+ import ImageProcessor
3
+ import pickle
4
+
5
def run(foldername):
    """End-to-end processing for one downloaded case folder.

    Runs the image pipeline (layout detection + data.csv), parses the
    opinion text into a Case object, and pickles the result as
    ``PDF Cases/<foldername>/processed.pkl``.
    """
    base = 'PDF Cases/' + foldername
    ImageProcessor.process_file(base)
    case = Parser.run(base)
    with open(base + '/processed.pkl', 'wb') as outp:
        pickle.dump(case, outp, pickle.HIGHEST_PROTOCOL)
10
+
Parser.py ADDED
@@ -0,0 +1,287 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import fitz
2
+ import numpy as np
3
+ import os
4
+ import pandas as pd
5
+ import re
6
+ import datetime
7
+ import pytesseract
8
+ import cv2
9
+ import warnings
10
+ import ocrmypdf
11
+ import spacy
12
+ import dateparser
13
+ import statistics
14
+ from statistics import mode
15
+ from textblob import Word
16
+ from Levenshtein import distance
17
+
18
+ nlp = spacy.load('en_core_web_trf')
19
+
20
def parse_doc(folderpath):
    """Extract header/body/footer text for every page of the opinion PDF.

    Returns:
        (header_texts, body_texts, footer_texts, paginated_dict) where the
        lists are in page order and paginated_dict maps the original page
        index to its (header, body, footer) triple. The cover page (whose
        header contains "preliminary print") is skipped entirely.
    """
    document = fitz.open(folderpath + '/opinion.pdf')
    headers = []
    bodies = []
    footers = []
    by_page = {}
    for page_index in range(len(document)):
        header, body, footer = parse_page(folderpath, page_index)
        # The "preliminary print" banner marks the cover page; skip it.
        if "preliminary print" in header.lower():
            continue
        bodies.append(body)
        headers.append(header)
        footers.append(footer)
        by_page[page_index] = (header, body, footer)
    return headers, bodies, footers, by_page
33
+
34
+
35
def parse_page(folderpath, pg_ind):
    """Extract header, body, and footer text for one PDF page.

    Reads the bounding boxes that ImageProcessor wrote to
    ``folderpath/data.csv`` and clips PyMuPDF's text extraction to each
    region.

    Returns:
        (header_text, body_text, footer_text); footer_text is None when
        the page had no footnote box (NaN coordinates in the CSV).
    """
    df = pd.read_csv(folderpath + '/data.csv')

    header_text, body_text, footer_text = None, None, None
    # One CSV row per page; pick the record for this page index.
    page_df = df[df['Pg Ind'] == pg_ind].to_dict('records')[0]
    header = [page_df['Header X1'], page_df['Header Y1'], page_df['Header X2'], page_df['Header Y2']]
    body = [page_df['Body X1'], page_df['Body Y1'], page_df['Body X2'], page_df['Body Y2']]
    footer = [page_df['Footer X1'], page_df['Footer Y1'], page_df['Footer X2'], page_df['Footer Y2']]
    case_split = page_df['Case Separator Y']  # NOTE(review): read but unused here
    body_rect = fitz.Rect(body[0], body[1], body[2], body[3])
    header_rect = fitz.Rect(header[0], header[1], header[2], header[3])
    footer_rect = fitz.Rect(footer[0], footer[1], footer[2], footer[3])

    doc = fitz.open(folderpath + '/opinion.pdf')
    page = doc[pg_ind]
    # The draft watermark is stripped from every extracted region.
    header_text = page.get_text("text", clip=header_rect).strip().replace('Page Proof Pending Publication', '')
    body_text = page.get_text("text", clip=body_rect).strip().replace('Page Proof Pending Publication', '')
    # NaN footer coordinates mean no footnote region was detected.
    if str(footer_rect[0]) != "nan":
        footer_text = page.get_text("text", clip=footer_rect).strip().replace('Page Proof Pending Publication', '')
    return header_text, body_text, footer_text
55
+
56
+
57
def get_splits(folderpath):
    """Split the full opinion body text into per-opinion chunks.

    Joins all page body texts, drops hyphens (so OCR line-break
    hyphenation cannot break the regexes), spell-corrects near-misses of
    "justice", then splits at opinion boundaries: "Per Curiam.",
    "Justice X ... delivered the opinion", or
    "justice X ... concurring/dissenting.".

    Returns:
        (splits, paginated_dict): splits[0] is the pre-matter before the
        first boundary; with no boundary the whole text is one chunk.
    """
    header_texts, body_texts, footer_texts, paginated_dict = parse_doc(folderpath)
    full_body_text = "\n".join(body_texts).replace('-', '')
    full_body_text = correct(full_body_text, "justice")

    # NOTE(review): '[A-z]' also matches '[', ']', '_', etc. — presumably
    # '[A-Za-z]' was intended; left unchanged to preserve behavior.
    split_p = re.compile(
        '((\n|^)\s*Per Curiam\.\s*\n)|((\n|^)\s*(Mr\.\s*)?Justice[A-z\s\n,]*delivered the opinion)|((\n|^)\s*(mr\.\s*)?justice[A-Za-z\n\s,–-]*(concurring|dissenting)[A-Za-z\n\s,–]*\.)',
        re.IGNORECASE)
    splits_m = list(re.finditer(split_p, full_body_text))
    splits = []

    if len(splits_m) > 0:
        print("---Found split---")
    # Chunks are [0 .. match 0), [match 0 .. match 1), ..., [last match .. end].
    # With zero matches the loop body runs once and emits the whole text.
    i = 0
    while i <= len(splits_m):
        if i == 0:
            start = 0
        else:
            start = splits_m[i - 1].span()[0]
        if i == len(splits_m):
            splits.append(full_body_text[start:].strip())
        else:
            splits.append(full_body_text[start:splits_m[i].span()[0]].strip())
        i = i + 1
    return splits, paginated_dict
82
+
83
+
84
def get_split_data(split):
    """Classify an opinion chunk and extract its author and joiners.

    Runs spaCy NER over the first sentence of the chunk to pull out
    justice surnames, then keys the opinion type off signal phrases
    ("delivered", "per curiam", "concurring", "dissenting"). Later
    matches intentionally overwrite earlier ones.

    Returns:
        (opinion_type, author, joining): opinion_type is one of
        "majority", "concurrence", "dissent", "pre"; author is a surname,
        "Per Curiam", "Chief", or None; joining is a list of surnames.
    """
    txt = split[0:300]  # the attribution line appears at the very top
    d = nlp(txt)
    first_sent = list(d.sents)[0]
    first_sent_text = " ".join([t.text for t in first_sent])
    ents = nlp(first_sent_text).ents
    # "Justice Breyer" -> "Breyer": keep only the surname after "...tice".
    person_ents = [e.text.lower().split('tice')[-1].strip().capitalize()
                   for e in ents if e.label_ == "PERSON"]
    if "the chief justice" in first_sent_text:
        person_ents.append("Chief")
    opinion_type, author, joining = None, None, []
    # Guard the IndexError the original hit when NER found no PERSON
    # entities: author simply stays None in that case.
    lead = person_ents[0] if person_ents else None
    if "delivered" in first_sent_text:
        author = lead
        joining = []
        opinion_type = "majority"
    if "per curiam" in first_sent_text.lower():
        author = "Per Curiam"
        joining = []
        opinion_type = "majority"
    if "concurring" in first_sent_text:
        author = lead
        joining = person_ents[1:]
        opinion_type = "concurrence"
    if "dissenting" in first_sent_text:
        author = lead
        joining = person_ents[1:]
        opinion_type = "dissent"
    if opinion_type is None:  # idiom fix: was "== None"
        opinion_type = "pre"
    return opinion_type, author, joining
113
+
114
def court_from_year(date_time):
    """Return the composition of the Court at a given datetime.

    Reads 'Justices Table.csv' (Name, Start, End columns; End is NaN for
    sitting justices) and collects everyone whose tenure spans date_time.

    Returns:
        dict with 'Associate': list of associate-justice surnames and
        'Chief': the chief justice's name (or None).
    """
    justices = pd.read_csv('Justices Table.csv').to_dict('records')
    composition = {'Associate': [], 'Chief': None}
    for record in justices:
        tenure_start = datetime.datetime.strptime(record['Start'], '%Y-%m-%d')
        if str(record['End']) == "nan":
            # Sitting justice: treat the tenure as open-ended.
            tenure_end = datetime.datetime.now()
        else:
            tenure_end = datetime.datetime.strptime(record['End'], '%Y-%m-%d')
        if not (tenure_start < date_time < tenure_end):
            continue
        name = record['Name']
        if "Associate" in name:
            surname = name.split('(Associate Justice)')[0].split(', ')[0].strip().split(' ')[-1]
            composition['Associate'].append(surname)
        if "Chief" in name:
            composition['Chief'] = name.split('(Chief Justice)')[0].split(', ')[0].strip()
    return composition
131
+
132
def correct(corpus, keyword):
    """Spell-correct near-misses of *keyword* in *corpus*.

    Space-separated tokens within Levenshtein distance 1 of the keyword
    (but not equal to it) are run through TextBlob's spellchecker;
    suggestions with confidence > 0.9 that differ from the token replace
    it. A token that contained a newline keeps a leading newline so line
    structure survives the rejoin.
    """
    tokens = corpus.split(' ')
    candidates = [(idx, tok) for idx, tok in enumerate(tokens)
                  if 0 < distance(keyword, tok.lower()) < 2]
    for idx, tok in candidates:
        suggestion, confidence = Word(tok.lower()).spellcheck()[0]
        if confidence > 0.9 and suggestion.lower() != tok.lower():
            tokens[idx] = ("\n" + suggestion) if "\n" in tok else suggestion
    return " ".join(tokens)
149
+
150
class Opinion:
    """One opinion within a case.

    Attributes:
        opinion_type: "majority", "concurrence", "dissent", or "pre".
        author: Authoring justice's surname, "Per Curiam", "Chief", or None.
        joining: List of surnames of justices who joined this opinion.
        body_text: Full text of the opinion.
        fn_text: Footnote text when separated out (may be None).
        header_text: Running-header text when captured (may be None).
    """

    def __init__(self, opinion_type, author, joining, body_text, fn_text, header_text):
        self.opinion_type = opinion_type
        self.author = author
        self.joining = joining
        self.body_text = body_text
        self.fn_text = fn_text
        self.header_text = header_text

    def __repr__(self):
        # Debug-friendly summary; the (long) body text is deliberately omitted.
        return (f"{type(self).__name__}(opinion_type={self.opinion_type!r}, "
                f"author={self.author!r}, joining={self.joining!r})")
158
+
159
class Case:
    """A fully parsed Supreme Court case.

    Aggregates the Opinion objects found in the PDF plus metadata
    (decision date, name, citation, page numbers, recusals, cert info)
    extracted from the text. ``process()`` runs all extraction steps and
    must be called only after the opinions have been attached.
    """

    def __init__(self, paginated_dict):
        # Maps page index -> (header_text, body_text, footer_text).
        self.paginated_dict = paginated_dict
        # Opinion slots, filled in externally (see Parser.run).
        self.majority, self.concurrences, self.dissents, self.pre = None, [], [], None
        self.date, self.case_name, self.case_citation, self.page_numbers = None, "", None, []
        self.recused = []  # surnames of justices who took no part
        self.cert_info = None

    def get_date(self):
        """Extract the decision date from the pre-matter ("Decided ...")."""
        print("Extracting Date")
        # Only the first 2000 chars are scanned; the date appears early.
        doc = nlp(self.pre.body_text[0:2000])
        sents = list(doc.sents)
        for s in sents:
            if "Decided" in s.text:
                date_extract = s.text.replace('\n', '').split('Decided')[-1].strip().replace('.', '')
                pattern = re.compile('Decided\s*\w*\s*[0-9]{1,2}, [0-9]{4}')
                match = re.search(pattern, s.text)
                # The regex-anchored extraction overwrites the rougher
                # split-based value computed above.
                date_extract = s.text[match.span()[0]:match.span()[1]].split('Decided')[-1].strip()
                date = datetime.datetime.strptime(date_extract, '%B %d, %Y')
                self.date = date
                return

    def update_recused(self):
        """Detect recused justices via "justice ... took no part"."""
        print("Identifying recused")
        p = re.compile('(?:justice )[\w\s]*(?: took no part)', re.IGNORECASE)
        m = re.search(p, self.majority.body_text)
        if m is not None:
            recused_span = self.majority.body_text[m.span()[0]:m.span()[1]].lower()
            doc = nlp(recused_span)
            # "justice kagan" -> "Kagan"
            self.recused = [e.text.split('justice')[-1].upper().strip().capitalize() for e in doc.ents if
                            e.label_ == "PERSON"]
            if "chief justice" in recused_span:
                self.recused.append("Chief")

    def update_majority_joining(self):
        """Infer the majority's joiners: every justice seated on the
        decision date who neither wrote/joined another opinion nor recused."""
        print("Getting updated list")
        cy = court_from_year(self.date)
        known = [j for d in self.dissents for j in d.joining] + \
                [d.author for d in self.dissents] + \
                [j for c in self.concurrences for j in c.joining] + \
                [c.author for c in self.concurrences] + \
                [self.majority.author] + [r for r in self.recused]
        all_justices = [aj for aj in cy['Associate']]
        if cy['Chief'] is not None:
            all_justices.append('Chief')
        self.majority.joining = [aj for aj in all_justices if aj not in known]

    def get_cert_info(self):
        """Extract the certiorari / procedural-posture lines from the pre-matter."""
        print("Extracting Cert Info")
        lines = self.pre.body_text.split('\n')
        start = -1
        end = -1
        for (i, l) in enumerate(lines):
            # Start at the last procedural-looking line; stop just before
            # the docket-number / Argued / Decided line.
            if "petition" in l.lower() or "cert" in l.lower() or "appeals" in l.lower() or "on" in l.lower().split(' '):
                start = i
            if "no." in l.lower() or "no.s" in l.lower() or "argued" in l.lower() or "decided" in l.lower():
                end = i
                break
        # NOTE(review): the first replace appears as ' ' -> ' ' in the source;
        # possibly a lost double-space or non-breaking-space normalization.
        self.cert_info = " ".join(lines[start:end]).strip().upper().replace(' ', ' ').replace('.', '')

    def get_case_name_cite_pns(self):
        """Derive case name, U.S. citation, and per-page U.S. Reports page
        numbers from the page headers."""
        # All header lines except each header's last line.
        lines_total = [l for p in self.paginated_dict for l in self.paginated_dict[p][0].split('\n')[:-1]]
        lines_selected = []
        # Exclude boilerplate (term label, opinion labels, citations, etc.).
        p = re.compile('(october|per curiam|opinion of|concur|dissent|statement of|argument|syllabus|[0-9] ?U.)', re.IGNORECASE)
        for l in lines_total:
            m = re.search(p, l)
            if m is None and not l.lower().strip().isnumeric():
                lines_selected.append(l)
        # The most frequent remaining header line is taken as the case name.
        self.case_name = mode(lines_selected)

        # First "NNN U. S. NNN" (or "U. S. ___") citation found in a header.
        p = re.compile('[0-9]*\s?U\.\s?S\. ?([0-9]|_)*', re.IGNORECASE)
        lines_selected = []
        for l in lines_total:
            m = re.search(p, l)
            if m is not None:
                self.case_citation = l[m.span()[0]:m.span()[1]]
                break

        # Header lines that are a bare number are candidate page numbers.
        p = re.compile('^\s?[0-9]+\s?$', re.IGNORECASE)
        page_lines = [self.paginated_dict[p][0].split('\n') for p in self.paginated_dict]
        self.page_numbers = []
        for pl in page_lines:
            numeric_on_page = []
            for l in pl:
                matches = list(re.finditer(p, l))
                for m in matches:
                    possibility = int(l[m.span()[0]:m.span()[1]].strip())
                    numeric_on_page.append(possibility)
            if len(numeric_on_page) == 0:
                # No number on this page: continue the previous sequence.
                if len(self.page_numbers) > 0:
                    self.page_numbers.append(self.page_numbers[-1] + 1)
                else:
                    self.page_numbers.append(1)
            if len(numeric_on_page) > 0:
                # Keep the sequence monotonically increasing.
                page_number = max(numeric_on_page)
                if len(self.page_numbers) > 0:
                    page_number = max(page_number, self.page_numbers[-1] + 1)
                self.page_numbers.append(page_number)

        # "NNN U. S." with no trailing page gets the first page appended.
        if self.case_citation is not None and self.case_citation.lower().split('s.')[-1].strip() == "":
            self.case_citation = self.case_citation.strip() + ' ' + str(self.page_numbers[0])

    def process(self):
        """Run every extraction step, in dependency order."""
        self.get_date()
        self.update_recused()
        self.update_majority_joining()
        self.get_cert_info()
        self.get_case_name_cite_pns()
267
+
268
def run(folderpath):
    """Parse a case folder into a fully populated Case object.

    Splits the opinion text into chunks, classifies each chunk into an
    Opinion, attaches it to the appropriate slot on the Case, then runs
    the Case's metadata extraction.

    Returns:
        The processed Case.
    """
    splits, paginated_dict = get_splits(folderpath)
    case = Case(paginated_dict=paginated_dict)
    opinions = []
    for chunk in splits:
        opinion_type, author, joining = get_split_data(chunk)
        if opinion_type is None:
            continue
        opinion = Opinion(opinion_type, author, joining, chunk, fn_text=None, header_text=None)
        if opinion_type == "majority":
            case.majority = opinion
        if opinion_type == "concurrence":
            case.concurrences.append(opinion)
        if opinion_type == "dissent":
            case.dissents.append(opinion)
        if opinion_type == "pre":
            case.pre = opinion
        opinions.append(opinion)

    case.process()
    return case
Scraper.py ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from bs4 import BeautifulSoup
2
+ import re
3
+ import requests
4
+ import os
5
+
6
def download_slip(link):
    """Download a slip-opinion PDF from supremecourt.gov.

    Saves it as ``PDF Cases/<basename>/opinion.pdf``, creating the case
    directory when needed.

    Args:
        link: Site-relative PDF path, e.g. ``/opinions/22pdf/xxx.pdf``.

    Raises:
        requests.HTTPError: on a non-2xx response (previously an HTML
            error page would be silently saved as the "PDF").
    """
    base = link.split('/')[-1].split('.pdf')[0]
    folder = 'PDF Cases/' + base
    os.makedirs(folder, exist_ok=True)  # race-free replacement for isdir+mkdir
    name = folder + '/' + "opinion.pdf"
    # stream=True downloads in chunks instead of buffering the whole PDF;
    # the context manager guarantees the connection is released.
    with requests.get("https://www.supremecourt.gov" + link, stream=True) as r:
        r.raise_for_status()
        with open(name, 'wb') as f:
            for chunk in r.iter_content(chunk_size=1024):
                f.write(chunk)
15
+
16
def download_loc(link):
    """Download a U.S. Reports scan from a Library of Congress link.

    File names look like ``usrepVVVPPP....pdf``; the volume/page pair is
    parsed out to name the destination folder ``PDF Cases/<vol>_<page>``
    and the file is saved there as ``opinion.pdf``.

    Raises:
        requests.HTTPError: on a non-2xx response (previously an error
            page would be silently saved as the "PDF").
    """
    base = link.split('/')[-1].split('.pdf')[0]
    rep = base.split('usrep')[-1]
    volume = int(rep[0:3])  # first three digits: volume number
    page = int(rep[3:])     # remaining digits: starting page number
    foldername = str(volume) + '_' + str(page)
    folder = 'PDF Cases/' + foldername
    os.makedirs(folder, exist_ok=True)  # race-free replacement for isdir+mkdir
    name = folder + '/' + "opinion.pdf"
    with requests.get(link, stream=True) as r:
        r.raise_for_status()
        with open(name, 'wb') as f:
            for chunk in r.iter_content(chunk_size=1024):
                f.write(chunk)
28
+
29
+
30
def slip_pipeline(year):
    """Scrape and download every slip opinion for one term year.

    Fetches the slip-opinion index page, collects PDF links from the
    accordion listing (skipping "new"/"diff" variants), and downloads
    each one via download_slip().

    Args:
        year: Term-year path segment used by supremecourt.gov (e.g. 22).
    """
    page = requests.get("https://www.supremecourt.gov/opinions/slipopinion/" + str(year))
    page.raise_for_status()  # fail loudly instead of parsing an error page
    # Explicit parser avoids bs4's "no parser specified" warning and keeps
    # parsing consistent across environments.
    soup = BeautifulSoup(page.text, 'html.parser')
    html_links = soup.findAll('div', attrs={'id': 'accordion'})[0].findAll('a')
    links = []
    for link in html_links:
        href = link.get('href')
        # Guard anchors without href (previously crashed on None.lower()).
        if href and ".pdf" in href.lower() and "new" not in href and "diff" not in href:
            links.append(href)

    for l in links:
        download_slip(l)