Upload 4 files
Browse files- ImageProcessor.py +148 -0
- Manager.py +10 -0
- Parser.py +287 -0
- Scraper.py +40 -0
ImageProcessor.py
ADDED
|
@@ -0,0 +1,148 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import cv2
|
| 2 |
+
import fitz
|
| 3 |
+
import numpy as np
|
| 4 |
+
import os
|
| 5 |
+
import pandas as pd
|
| 6 |
+
import pytesseract
|
| 7 |
+
import warnings
|
| 8 |
+
|
| 9 |
+
def pdf2png(folderpath):
    """Render every page of ``folderpath/opinion.pdf`` to a PNG.

    Pages are rasterized at zoom 1 and written into the same folder as
    ``0.png``, ``1.png``, ...

    Args:
        folderpath: directory containing ``opinion.pdf``.
    """
    doc = fitz.open(folderpath + '/opinion.pdf')
    try:
        zoom = 1  # identity scale; raise for higher-resolution rasters
        mat = fitz.Matrix(zoom, zoom)
        for (i, p) in enumerate(doc):
            pix = p.get_pixmap(matrix=mat)
            pix.save(folderpath + '/' + str(i) + '.png')
    finally:
        # Release the PDF file handle even if a page fails to render.
        doc.close()
|
| 16 |
+
|
| 17 |
+
def get_footnote_bbox(filename):
    """Locate the footnote block on a rendered page image.

    Looks for the short horizontal rule that precedes footnotes: a thin,
    wide contour hugging the page's left margin.

    Args:
        filename: path to a page PNG produced by ``pdf2png``.

    Returns:
        ``(x1, y1, x2, y2)`` spanning from the rule down to the
        bottom-right of the page body, or ``(None, None, None, None)``
        when no rule is detected.
    """
    footnotes_bbox = (None, None, None, None)
    x1p, y1p, x2p, y2p = get_page_bbox(filename)
    image = cv2.imread(filename)
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    # Isolate dark marks, then smear horizontally so a thin rule becomes
    # one wide contour.
    thresh = cv2.threshold(gray, 240, 255, cv2.THRESH_BINARY_INV)[1]
    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (20, 1))
    dilate = cv2.dilate(thresh, kernel, iterations=1)
    cnts = cv2.findContours(dilate, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    cnts = cnts[0] if len(cnts) == 2 else cnts[1]  # OpenCV 3 vs 4 return shape
    cnts = sorted(cnts, key=lambda c: cv2.boundingRect(c)[1])  # top-to-bottom
    for c in cnts:
        x, y, w, h = cv2.boundingRect(c)
        # Footnote rule: thin (h < 7), reasonably wide (w > 50), below the
        # page top, near the left margin.  No break: the last (lowest)
        # qualifying rule wins, matching where footnotes actually start.
        if h < 7 and w > 50 and y > y1p and x - x1p < 30:
            footnotes_bbox = (x, y, x2p, y2p)
    return footnotes_bbox
|
| 35 |
+
|
| 36 |
+
def get_header_bbox(filename):
    """Locate the running header at the top of a page image.

    The page is Otsu-thresholded and dilated with a very wide kernel so
    each text line merges into one blob; the topmost blob is taken as
    the header band.

    Args:
        filename: path to a page PNG.

    Returns:
        ``(x1, y1, x2, y2)``; the bottom edge is fixed 40px below the
        band's top.

    Raises:
        ValueError: if the page contains no detectable ink (the original
            code hit an undefined-name error in that case).
    """
    image = cv2.imread(filename)
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    blur = cv2.GaussianBlur(gray, (9, 9), 0)
    thresh = cv2.threshold(blur, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)[1]

    # Very wide kernel: merge a whole line of text into a single blob.
    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (200, 10))
    dilate = cv2.dilate(thresh, kernel, iterations=1)

    cnts = cv2.findContours(dilate, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    cnts = cnts[0] if len(cnts) == 2 else cnts[1]  # OpenCV 3 vs 4 return shape
    if not cnts:
        raise ValueError('no contours found in %s; cannot locate header' % filename)
    # Topmost contour = header band (replaces the old sort + immediate break).
    x, y, w, h = cv2.boundingRect(min(cnts, key=lambda c: cv2.boundingRect(c)[1]))
    header_bbox = (x, y, x + w, y + 40)
    # header_bbox = (145, 45, 465, 155) # For digitized variants
    return header_bbox
|
| 56 |
+
|
| 57 |
+
|
| 58 |
+
def get_page_bbox(filename):
    """Compute the bounding box of all printed content on a page image.

    Args:
        filename: path to a page PNG.

    Returns:
        ``(x1, y1, x2, y2)`` where the top edge is anchored to the
        header's top (margins above the header are excluded) and the
        other edges are the extremes over all detected text contours.

    Raises:
        ValueError: if the page contains no detectable ink.
    """
    image = cv2.imread(filename)
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    blur = cv2.GaussianBlur(gray, (7, 7), 0)
    thresh = cv2.threshold(blur, 240, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)[1]
    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (50, 10))
    dilate = cv2.dilate(thresh, kernel, iterations=1)
    cnts = cv2.findContours(dilate, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    cnts = cnts[0] if len(cnts) == 2 else cnts[1]  # OpenCV 3 vs 4 return shape
    if not cnts:
        raise ValueError('no contours found in %s; cannot compute page bbox' % filename)

    header_bbox = get_header_bbox(filename)
    # One boundingRect call per contour (the old code called it four
    # times per contour); min/max don't depend on contour order, so the
    # previous sort is unnecessary.
    rects = [cv2.boundingRect(c) for c in cnts]
    return (min(x for x, y, w, h in rects),
            header_bbox[1],
            max(x + w for x, y, w, h in rects),
            max(y + h for x, y, w, h in rects))
|
| 76 |
+
|
| 77 |
+
def get_case_separator(filename):
    """Find the horizontal rule separating two cases on one page.

    Scans contours top-to-bottom for a thin, wide mark sitting in the
    centered-left band of the page body, below both the page top and the
    header.

    Args:
        filename: path to a page PNG.

    Returns:
        ``(x1, y, x2, y)`` for the first such rule, or
        ``(None, None, None, None)`` when the page holds a single case.
    """
    new_case_line = (None, None, None, None)
    x1p, y1p, x2p, y2p = get_page_bbox(filename)
    x1h, y1h, x2h, y2h = get_header_bbox(filename)

    image = cv2.imread(filename)
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    blur = cv2.GaussianBlur(gray, (7, 7), 0)
    thresh = cv2.threshold(blur, 240, 255, cv2.THRESH_BINARY_INV)[1]
    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (20, 1))
    dilate = cv2.dilate(thresh, kernel, iterations=1)
    cnts = cv2.findContours(dilate, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    cnts = cnts[0] if len(cnts) == 2 else cnts[1]  # OpenCV 3 vs 4 return shape
    cnts = sorted(cnts, key=lambda c: cv2.boundingRect(c)[1])  # top-to-bottom
    x_center = (x1p + x2p) / 2  # loop-invariant: hoisted out of the scan
    for c in cnts:
        x, y, w, h = cv2.boundingRect(c)
        # Thin (h < 8), wide (w > 70) rule whose left offset falls in the
        # 30%-100% band left of page center, below page top and header.
        if h < 8 and w > 70 and (0.3 * x_center < (x - x1p) < x_center) and (y > y1p and y > y1h):
            new_case_line = (x1p, y, x2p, y)
            break
    return new_case_line
|
| 99 |
+
|
| 100 |
+
def get_page_elements(filename):
    """Detect all layout regions on a page image and annotate them.

    Args:
        filename: path to a page PNG.

    Returns:
        ``(page_bbox, header_bbox, fn_bbox, body_bbox,
        case_separator_bbox, image)`` where ``image`` is the page with
        each detected region outlined in a distinct color.
    """
    page_bbox = get_page_bbox(filename)
    header_bbox = get_header_bbox(filename)
    fn_bbox = get_footnote_bbox(filename)
    case_separator_bbox = get_case_separator(filename)

    # Body spans from below the header to the footnotes (when present)
    # or to the bottom of the page.
    body_bottom = fn_bbox[1] if fn_bbox[0] is not None else page_bbox[3]
    body_bbox = (page_bbox[0], header_bbox[3], page_bbox[2], body_bottom)

    annotated = cv2.imread(filename)
    cv2.rectangle(annotated, (page_bbox[0], page_bbox[1]), (page_bbox[2], page_bbox[3]), (0, 0, 0), 4)
    cv2.rectangle(annotated, (header_bbox[0], header_bbox[1]), (header_bbox[2], header_bbox[3]), (0, 255, 0), 2)
    cv2.rectangle(annotated, (body_bbox[0], body_bbox[1]), (body_bbox[2], body_bbox[3]), (255, 0, 0), 2)
    if fn_bbox[0] is not None:
        cv2.rectangle(annotated, (fn_bbox[0], fn_bbox[1]), (fn_bbox[2], fn_bbox[3]), (0, 0, 255), 2)
    if case_separator_bbox[0] is not None:
        cv2.rectangle(annotated, (case_separator_bbox[0], case_separator_bbox[1]),
                      (case_separator_bbox[2], case_separator_bbox[3]), (255, 0, 255), 2)

    return page_bbox, header_bbox, fn_bbox, body_bbox, case_separator_bbox, annotated
|
| 121 |
+
|
| 122 |
+
def process_file(folderpath):
    """Run layout detection over every page of ``folderpath/opinion.pdf``.

    Renders the PDF to PNGs, detects the header/body/footnote/page/case-
    separator regions on each page, writes an annotated
    ``<page>-processed.png`` per page, and saves all region coordinates
    to ``folderpath/data.csv``.

    Args:
        folderpath: directory containing ``opinion.pdf``.
    """
    pdf2png(folderpath)
    files = [f for f in os.listdir(folderpath)
             if '.png' in f.lower() and "processed" not in f.lower()]
    rows = []
    for f in files:
        page_bbox, header_bbox, fn_bbox, body_bbox, case_separator_bbox, image = \
            get_page_elements(folderpath + '/' + f)
        ind = int(f.split('.png')[0])
        rows.append({
            'Pg Ind': ind,
            'Header X1': header_bbox[0], 'Header Y1': header_bbox[1],
            'Header X2': header_bbox[2], 'Header Y2': header_bbox[3],
            'Body X1': body_bbox[0], 'Body Y1': body_bbox[1],
            'Body X2': body_bbox[2], 'Body Y2': body_bbox[3],
            'Footer X1': fn_bbox[0], 'Footer Y1': fn_bbox[1],
            'Footer X2': fn_bbox[2], 'Footer Y2': fn_bbox[3],
            'Page X1': page_bbox[0], 'Page Y1': page_bbox[1],
            'Page X2': page_bbox[2], 'Page Y2': page_bbox[3],
            'Case Separator Y': case_separator_bbox[1],
        })
        cv2.imwrite(folderpath + '/' + str(ind) + '-processed.png', image)
    # Build the DataFrame once: the old per-row pd.concat inside the loop
    # is quadratic in the number of pages.
    columns = ['Pg Ind',
               'Header X1', 'Header Y1', 'Header X2', 'Header Y2',
               'Body X1', 'Body Y1', 'Body X2', 'Body Y2',
               'Footer X1', 'Footer Y1', 'Footer X2', 'Footer Y2',
               'Page X1', 'Page Y1', 'Page X2', 'Page Y2',
               'Case Separator Y']
    data_df = pd.DataFrame(rows, columns=columns)
    data_df['Pg Ind'] = data_df['Pg Ind'].astype('int')
    data_df.to_csv(folderpath + '/data.csv', index=False)
|
| 148 |
+
|
Manager.py
ADDED
|
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import Parser
|
| 2 |
+
import ImageProcessor
|
| 3 |
+
import pickle
|
| 4 |
+
|
| 5 |
+
def run(foldername):
    """Process one case folder end-to-end and pickle the parsed Case.

    Runs layout detection, parses the opinions, and serializes the
    resulting Case to ``PDF Cases/<foldername>/processed.pkl``.
    """
    case_dir = 'PDF Cases/' + foldername
    ImageProcessor.process_file(case_dir)
    parsed_case = Parser.run(case_dir)
    with open(case_dir + '/processed.pkl', 'wb') as outp:
        pickle.dump(parsed_case, outp, pickle.HIGHEST_PROTOCOL)
|
| 10 |
+
|
Parser.py
ADDED
|
@@ -0,0 +1,287 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import fitz
|
| 2 |
+
import numpy as np
|
| 3 |
+
import os
|
| 4 |
+
import pandas as pd
|
| 5 |
+
import re
|
| 6 |
+
import datetime
|
| 7 |
+
import pytesseract
|
| 8 |
+
import cv2
|
| 9 |
+
import warnings
|
| 10 |
+
import ocrmypdf
|
| 11 |
+
import spacy
|
| 12 |
+
import dateparser
|
| 13 |
+
import statistics
|
| 14 |
+
from statistics import mode
|
| 15 |
+
from textblob import Word
|
| 16 |
+
from Levenshtein import distance
|
| 17 |
+
|
| 18 |
+
nlp = spacy.load('en_core_web_trf')
|
| 19 |
+
|
| 20 |
+
def parse_doc(folderpath):
    """Extract header/body/footer text for every page of the opinion PDF.

    The preliminary-print cover page is skipped entirely.

    Returns:
        Three parallel lists (headers, bodies, footers) plus a dict
        mapping page index -> ``(header, body, footer)``.
    """
    doc = fitz.open(folderpath + '/opinion.pdf')
    header_texts = []
    body_texts = []
    footer_texts = []
    paginated_dict = {}
    for page_index in range(len(doc)):
        ht, bt, ft = parse_page(folderpath, page_index)
        if "preliminary print" in ht.lower():  # Skip cover page
            continue
        header_texts.append(ht)
        body_texts.append(bt)
        footer_texts.append(ft)
        paginated_dict[page_index] = (ht, bt, ft)
    return header_texts, body_texts, footer_texts, paginated_dict
|
| 33 |
+
|
| 34 |
+
|
| 35 |
+
def parse_page(folderpath, pg_ind):
    """Extract header, body, and footer text for one page.

    Reads the region coordinates produced by ImageProcessor from
    ``folderpath/data.csv`` and clips PDF text extraction to each
    region.

    Args:
        folderpath: case directory holding ``opinion.pdf`` and ``data.csv``.
        pg_ind: zero-based page index.

    Returns:
        ``(header_text, body_text, footer_text)``; ``footer_text`` is
        ``None`` when the page has no footnote block.
    """
    df = pd.read_csv(folderpath + '/data.csv')
    page_df = df[df['Pg Ind'] == pg_ind].to_dict('records')[0]

    header_rect = fitz.Rect(page_df['Header X1'], page_df['Header Y1'],
                            page_df['Header X2'], page_df['Header Y2'])
    body_rect = fitz.Rect(page_df['Body X1'], page_df['Body Y1'],
                          page_df['Body X2'], page_df['Body Y2'])

    doc = fitz.open(folderpath + '/opinion.pdf')
    page = doc[pg_ind]

    def _clip_text(rect):
        # Strip the watermark stamped on preliminary prints.
        return page.get_text("text", clip=rect).strip().replace('Page Proof Pending Publication', '')

    header_text = _clip_text(header_rect)
    body_text = _clip_text(body_rect)
    footer_text = None
    # Footer coordinates are NaN when no footnote block was detected;
    # test before building the Rect instead of string-comparing "nan".
    if pd.notna(page_df['Footer X1']):
        footer_rect = fitz.Rect(page_df['Footer X1'], page_df['Footer Y1'],
                                page_df['Footer X2'], page_df['Footer Y2'])
        footer_text = _clip_text(footer_rect)
    return header_text, body_text, footer_text
|
| 55 |
+
|
| 56 |
+
|
| 57 |
+
def get_splits(folderpath):
    """Split the full opinion text into per-opinion chunks.

    Joins all page body text, repairs OCR misspellings of "justice",
    then splits at each opinion heading ("Per Curiam.", "Justice X
    delivered the opinion", "Justice X, concurring/dissenting").

    Returns:
        ``(splits, paginated_dict)`` where ``splits[0]`` is any text
        before the first heading and ``paginated_dict`` comes from
        ``parse_doc``.
    """
    header_texts, body_texts, footer_texts, paginated_dict = parse_doc(folderpath)
    full_body_text = "\n".join(body_texts).replace('-', '')
    full_body_text = correct(full_body_text, "justice")

    # NOTE: the original pattern used [A-z], which in ASCII also matches
    # the punctuation between 'Z' and 'a' ([ \ ] ^ _ `); [A-Za-z] is the
    # intended class.
    split_p = re.compile(
        r'((\n|^)\s*Per Curiam\.\s*\n)|((\n|^)\s*(Mr\.\s*)?Justice[A-Za-z\s\n,]*delivered the opinion)|((\n|^)\s*(mr\.\s*)?justice[A-Za-z\n\s,–-]*(concurring|dissenting)[A-Za-z\n\s,–]*\.)',
        re.IGNORECASE)
    splits_m = list(re.finditer(split_p, full_body_text))
    if splits_m:
        print("---Found split---")

    # Chunk boundaries: start of text plus the start of each heading;
    # each chunk runs to the next heading (or end of text).
    starts = [0] + [m.span()[0] for m in splits_m]
    ends = [m.span()[0] for m in splits_m] + [len(full_body_text)]
    splits = [full_body_text[s:e].strip() for s, e in zip(starts, ends)]
    return splits, paginated_dict
|
| 82 |
+
|
| 83 |
+
|
| 84 |
+
def get_split_data(split):
    """Classify one opinion chunk from its opening sentence.

    Runs NER over the first sentence to extract justice names, then keys
    off signal phrases ("delivered", "per curiam", "concurring",
    "dissenting").  Later phrases win when several appear.

    Args:
        split: one chunk produced by ``get_splits``.

    Returns:
        ``(opinion_type, author, joining)``; ``opinion_type`` falls back
        to "pre" (prefatory material such as the syllabus) when no
        phrase matches.
    """
    txt = split[0:300]
    d = nlp(txt)
    first_sent = list(d.sents)[0]
    first_sent_text = " ".join([t.text for t in first_sent])
    ents = nlp(first_sent_text).ents
    # Normalize "JUSTICE SO-AND-SO" -> "So-and-so" (strip the title).
    person_ents = [e.text.lower().split('tice')[-1].strip().capitalize()
                   for e in ents if e.label_ == "PERSON"]
    if "the chief justice" in first_sent_text:
        person_ents.append("Chief")
    # Guard: NER may find no names; report None rather than crashing on
    # person_ents[0].
    first_person = person_ents[0] if person_ents else None
    opinion_type, author, joining = None, None, []
    if "delivered" in first_sent_text:
        author = first_person
        joining = []
        opinion_type = "majority"
    if "per curiam" in first_sent_text.lower():
        author = "Per Curiam"
        joining = []
        opinion_type = "majority"
    if "concurring" in first_sent_text:
        author = first_person
        joining = person_ents[1:]
        opinion_type = "concurrence"
    if "dissenting" in first_sent_text:
        author = first_person
        joining = person_ents[1:]
        opinion_type = "dissent"
    if opinion_type is None:
        opinion_type = "pre"
    return opinion_type, author, joining
|
| 113 |
+
|
| 114 |
+
def court_from_year(date_time):
    """Return the court composition at a given date.

    Reads 'Justices Table.csv' (columns Name/Start/End, ISO dates) and
    collects the justices whose tenure spans ``date_time``.

    Returns:
        ``{'Associate': [<last names>], 'Chief': <name> or None}``.
    """
    justice_dict = pd.read_csv('Justices Table.csv').to_dict('records')
    court_year = {'Associate': [], 'Chief': None}
    for j in justice_dict:
        start = datetime.datetime.strptime(j['Start'], '%Y-%m-%d')
        # Sitting justices have no End date (NaN in the CSV): treat as now.
        if str(j['End']) == "nan":
            end = datetime.datetime.now()
        else:
            end = datetime.datetime.strptime(j['End'], '%Y-%m-%d')
        if not (start < date_time < end):
            continue
        name = j['Name']
        if "Associate" in name:
            court_year['Associate'].append(
                name.split('(Associate Justice)')[0].split(', ')[0].strip().split(' ')[-1])
        if "Chief" in name:
            court_year['Chief'] = name.split('(Chief Justice)')[0].split(', ')[0].strip()
    return court_year
|
| 131 |
+
|
| 132 |
+
def correct(corpus, keyword):
    """Spell-correct near-misses of ``keyword`` in ``corpus``.

    Finds space-separated tokens within Levenshtein distance 1 of
    ``keyword`` (OCR artifacts such as 'justlce') and replaces them with
    textblob's top suggestion when its confidence exceeds 0.9.

    Args:
        corpus: full text to repair.
        keyword: lowercase target word (e.g. "justice").

    Returns:
        The corrected text, whitespace structure preserved.
    """
    words = corpus.split(' ')
    potential_targets = []
    for (i, w) in enumerate(words):
        d = distance(keyword, w.lower())
        if 0 < d < 2:
            potential_targets.append((i, w))

    for (ind, pt) in potential_targets:
        word = Word(pt.lower())
        result = word.spellcheck()
        if result[0][1] > 0.9 and result[0][0].lower() != pt.lower():
            corrected = result[0][0]
            core = pt.strip()
            # Replace only the token core so any attached newlines stay in
            # place.  The old code emitted "\n" + correction whenever the
            # token contained a newline, which moved the newline to the
            # front and discarded the remaining characters of the token.
            words[ind] = pt.replace(core, corrected, 1) if core else corrected
    return " ".join(words)
|
| 149 |
+
|
| 150 |
+
class Opinion:
    """One opinion within a case: who wrote it, who joined, and its text."""

    def __init__(self, opinion_type, author, joining, body_text, fn_text, header_text):
        # Text regions extracted from the PDF.
        self.body_text = body_text
        self.fn_text = fn_text
        self.header_text = header_text
        # Classification: "majority" | "concurrence" | "dissent" | "pre".
        self.opinion_type = opinion_type
        # Authoring justice ("Chief", a last name, or "Per Curiam") and
        # the justices joining the opinion.
        self.author = author
        self.joining = joining
|
| 158 |
+
|
| 159 |
+
class Case:
    """A parsed Supreme Court case: its opinions plus extracted metadata
    (decision date, case name, U.S. citation, reporter page numbers,
    recusals, and cert/appeal info).

    Populated in two phases: ``Parser.run`` assigns the opinions, then
    ``process`` derives the metadata from their text.
    """

    def __init__(self, paginated_dict):
        # Page index -> (header_text, body_text, footer_text).
        self.paginated_dict = paginated_dict
        # Opinions, filled in by Parser.run.
        self.majority, self.concurrences, self.dissents, self.pre = None, [], [], None
        # Metadata, filled in by process().
        self.date, self.case_name, self.case_citation, self.page_numbers = None, "", None, []
        self.recused = []
        self.cert_info = None

    def get_date(self):
        """Set self.date from the 'Decided <Month> <day>, <year>' line
        found in the prefatory (syllabus) text."""
        print("Extracting Date")
        # Only the opening of the prefatory text is scanned.
        doc = nlp(self.pre.body_text[0:2000])
        sents = list(doc.sents)
        for s in sents:
            if "Decided" in s.text:
                # NOTE(review): this first date_extract is immediately
                # overwritten below; the regex match is the value parsed.
                date_extract = s.text.replace('\n', '').split('Decided')[-1].strip().replace('.', '')
                pattern = re.compile('Decided\s*\w*\s*[0-9]{1,2}, [0-9]{4}')
                match = re.search(pattern, s.text)
                date_extract = s.text[match.span()[0]:match.span()[1]].split('Decided')[-1].strip()
                date = datetime.datetime.strptime(date_extract, '%B %d, %Y')
                self.date = date
                return

    def update_recused(self):
        """Populate self.recused from a 'Justice X ... took no part'
        sentence in the majority opinion, if one exists."""
        print("Identifying recused")
        p = re.compile('(?:justice )[\w\s]*(?: took no part)', re.IGNORECASE)
        m = re.search(p, self.majority.body_text)
        if m is not None:
            recused_span = self.majority.body_text[m.span()[0]:m.span()[1]].lower()
            doc = nlp(recused_span)
            # Strip the 'justice' title and normalize capitalization.
            self.recused = [e.text.split('justice')[-1].upper().strip().capitalize() for e in doc.ents if
                            e.label_ == "PERSON"]
            if "chief justice" in recused_span:
                self.recused.append("Chief")

    def update_majority_joining(self):
        """Infer the majority's joiners: every justice sitting at
        self.date who is not already an author, joiner, or recusal."""
        print("Getting updated list")
        cy = court_from_year(self.date)
        # Every justice with a known role in this case.
        known = [j for d in self.dissents for j in d.joining] + [d.author for d in self.dissents] + [j for c in self.concurrences for j in c.joining] + [c.author for c in self.concurrences] + [self.majority.author] + [r for r in self.recused]
        all_justices = [aj for aj in cy['Associate']]
        if cy['Chief'] is not None:
            all_justices.append('Chief')
        self.majority.joining = [aj for aj in all_justices if aj not in known]

    def get_cert_info(self):
        """Extract the certiorari/appeal description: the prefatory lines
        between the 'on petition/certiorari/appeals/...' line and the
        docket-number/'Argued'/'Decided' line."""
        print("Extracting Cert Info")
        lines = self.pre.body_text.split('\n')
        start = -1
        end = -1
        for (i, l) in enumerate(lines):
            # Last matching line wins for start; first match ends the scan.
            if "petition" in l.lower() or "cert" in l.lower() or "appeals" in l.lower() or "on" in l.lower().split(' '):
                start = i
            if "no." in l.lower() or "no.s" in l.lower() or "argued" in l.lower() or "decided" in l.lower():
                end = i
                break
        # NOTE(review): .replace(' ', ' ') appears to be a no-op — possibly
        # a double-space or non-breaking-space collapse lost in transit;
        # confirm against the original intent.
        self.cert_info = " ".join(lines[start:end]).strip().upper().replace(' ', ' ').replace('.', '')

    def get_case_name_cite_pns(self):
        """Derive the case name, U.S. citation, and per-page reporter page
        numbers from the running headers."""
        # All header lines except each page's last line.
        lines_total = [l for p in self.paginated_dict for l in self.paginated_dict[p][0].split('\n')[:-1]]
        lines_selected = []
        # Exclude boilerplate header lines; the most frequent remaining
        # line is taken to be the case name.
        p = re.compile('(october|per curiam|opinion of|concur|dissent|statement of|argument|syllabus|[0-9] ?U.)', re.IGNORECASE)
        for l in lines_total:
            m = re.search(p, l)
            if m is None and not l.lower().strip().isnumeric():
                lines_selected.append(l)
        self.case_name = mode(lines_selected)

        # First 'NNN U. S. NNN'-styled string found becomes the citation.
        p = re.compile('[0-9]*\s?U\.\s?S\. ?([0-9]|_)*', re.IGNORECASE)
        lines_selected = []
        for l in lines_total:
            m = re.search(p, l)
            if m is not None:
                self.case_citation = l[m.span()[0]:m.span()[1]]
                break

        # Reporter page numbers: largest standalone number in each page's
        # header; when a page prints none, continue from the previous page
        # (or start at 1).  Note: the comprehension variable below shadows
        # the regex `p` only inside the comprehension.
        p = re.compile('^\s?[0-9]+\s?$', re.IGNORECASE)
        page_lines = [self.paginated_dict[p][0].split('\n') for p in self.paginated_dict]
        self.page_numbers = []
        for pl in page_lines:
            numeric_on_page = []
            for l in pl:
                matches = list(re.finditer(p, l))
                for m in matches:
                    possibility = int(l[m.span()[0]:m.span()[1]].strip())
                    numeric_on_page.append(possibility)
            if len(numeric_on_page) == 0:
                if len(self.page_numbers) > 0:
                    self.page_numbers.append(self.page_numbers[-1] + 1)
                else:
                    self.page_numbers.append(1)
            if len(numeric_on_page) > 0:
                page_number = max(numeric_on_page)
                if len(self.page_numbers) > 0:
                    page_number = max(page_number, self.page_numbers[-1] + 1)
                self.page_numbers.append(page_number)

        # Slip citations like '.. U. S. ___' have no page yet: append the
        # first reporter page number.
        if self.case_citation is not None and self.case_citation.lower().split('s.')[-1].strip() == "":
            self.case_citation = self.case_citation.strip() + ' ' + str(self.page_numbers[0])

    def process(self):
        """Run all metadata extraction steps.  Order matters: date and
        recusals feed update_majority_joining."""
        self.get_date()
        self.update_recused()
        self.update_majority_joining()
        self.get_cert_info()
        self.get_case_name_cite_pns()
|
| 267 |
+
|
| 268 |
+
def run(folderpath):
    """Parse one case folder into a fully-populated Case object.

    Splits the opinion text into chunks, classifies each chunk into an
    Opinion, slots the opinions into a Case, and runs the Case's
    metadata extraction.
    """
    splits, paginated_dict = get_splits(folderpath)
    case = Case(paginated_dict=paginated_dict)
    ops = []
    for chunk in splits:
        opinion_type, author, joining = get_split_data(chunk)
        if opinion_type is None:
            continue
        op = Opinion(opinion_type, author, joining, chunk, fn_text=None, header_text=None)
        if opinion_type == "majority":
            case.majority = op
        elif opinion_type == "concurrence":
            case.concurrences.append(op)
        elif opinion_type == "dissent":
            case.dissents.append(op)
        elif opinion_type == "pre":
            case.pre = op
        ops.append(op)

    case.process()
    return case
|
Scraper.py
ADDED
|
@@ -0,0 +1,40 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from bs4 import BeautifulSoup
|
| 2 |
+
import re
|
| 3 |
+
import requests
|
| 4 |
+
import os
|
| 5 |
+
|
| 6 |
+
def download_slip(link):
    """Download a slip opinion PDF from supremecourt.gov.

    Saves it as ``PDF Cases/<name>/opinion.pdf`` where ``<name>`` is the
    link's filename without extension.

    Args:
        link: site-relative PDF path (e.g. ``/opinions/...pdf``).

    Raises:
        requests.HTTPError: if the server returns an error status.
    """
    base = link.split('/')[-1].split('.pdf')[0]
    if not os.path.isdir('PDF Cases/' + base):
        os.mkdir('PDF Cases/' + base)
    name = 'PDF Cases/' + base + '/' + "opinion.pdf"
    # Context manager closes the streamed connection deterministically;
    # raise_for_status avoids silently saving an HTML error page as PDF.
    with requests.get("https://www.supremecourt.gov" + link, stream=True) as r:
        r.raise_for_status()
        with open(name, 'wb') as f:
            for chunk in r.iter_content(chunk_size=1024):
                f.write(chunk)
|
| 15 |
+
|
| 16 |
+
def download_loc(link):
    """Download a Library of Congress U.S. Reports PDF.

    The filename encodes the citation as ``usrepVVVPPP...``; the file is
    saved under ``PDF Cases/<volume>_<page>/opinion.pdf``.

    Args:
        link: absolute URL to the PDF.

    Raises:
        requests.HTTPError: if the server returns an error status.
    """
    base = link.split('/')[-1].split('.pdf')[0]
    # e.g. 'usrep410113' -> volume 410, page 113.
    volume = int(base.split('usrep')[-1][0:3])
    page = int(base.split('usrep')[-1][3:])
    foldername = str(volume) + '_' + str(page)
    if not os.path.isdir('PDF Cases/' + foldername):
        os.mkdir('PDF Cases/' + foldername)
    name = 'PDF Cases/' + foldername + '/' + "opinion.pdf"
    # Context manager closes the streamed connection deterministically;
    # raise_for_status avoids silently saving an HTML error page as PDF.
    with requests.get(link, stream=True) as r:
        r.raise_for_status()
        with open(name, 'wb') as f:
            for chunk in r.iter_content(chunk_size=1024):
                f.write(chunk)
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
def slip_pipeline(year):
    """Download every slip opinion listed for a term year.

    Scrapes the slip-opinion index page and downloads each linked PDF,
    skipping 'new' badges and revision-diff links.

    Args:
        year: term year as used in the site URL (e.g. 21).

    Raises:
        requests.HTTPError: if the index page request fails.
    """
    page = requests.get("https://www.supremecourt.gov/opinions/slipopinion/" + str(year))
    page.raise_for_status()
    # Explicit parser avoids bs4's GuessedAtParserWarning and makes the
    # parse reproducible across environments.
    soup = BeautifulSoup(page.text, 'html.parser')
    html_links = soup.findAll('div', attrs={'id': 'accordion'})[0].findAll('a')
    links = []
    for link in html_links:
        href = link.get('href')
        if ".pdf" in href.lower() and "new" not in href and "diff" not in href:
            links.append(href)

    for l in links:
        download_slip(l)
|