Spaces:

cools
/

Gideon

Runtime error

File size: 5,649 Bytes

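# This file locates the majority, concurring, dissenting and recusal sentences in a
# paragraphed court opinion and splits the text into one tagged section per opinion.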
import pandas as pd
import numpy as np
import re
from sentence_transformers import SentenceTransformer
from sentence_transformers.util import cos_sim
import cv2
import nltk
nltk.download('punkt')
from nltk.tokenize import sent_tokenize, word_tokenize


# Sentence-embedding model used by semantic_match(); instantiated once when the module is imported.
model = SentenceTransformer('all-mpnet-base-v2')

def get_paragraphed_text(folderpath):
    """Load ``paragraphs.csv`` from *folderpath* and return one string per paragraph.

    Every cell of the ``Lines`` column holds the textual representation of a
    sequence of line records; the last element of each record is the line's
    text. The lines of a paragraph are stripped, joined with single spaces,
    and any run of two or more spaces in the result is collapsed to one.

    Args:
        folderpath: Directory (as a string) containing ``paragraphs.csv``.

    Returns:
        A list of paragraph strings, in the order the rows appear in the CSV.
    """
    paras_df = pd.read_csv(folderpath + '/paragraphs.csv')
    paras_text = []
    for cell in paras_df['Lines'].tolist():
        # SECURITY: eval() executes whatever expression the CSV cell contains.
        # Only run this on CSVs produced by this pipeline; if the cells are
        # plain literals, ast.literal_eval would be the safe replacement.
        para = eval(cell)
        joined = " ".join(line[-1].strip() for line in para).strip()
        # Empty lines leave runs of several spaces after the join; a single
        # str.replace pass would shrink three spaces to two, not to one.
        paras_text.append(re.sub(r' {2,}', ' ', joined))
    return paras_text

def semantic_match(template, corpus):
    """Score every corpus entry against the template(s) by cosine similarity.

    Args:
        template: A single text, or a list of texts, to compare against.
        corpus: The candidate texts to be scored.

    Returns:
        A pair ``(index, score)``: ``np.argmax`` of the similarity matrix and
        the result of the builtin ``max`` over that matrix's rows.
    """
    corpus_vectors = model.encode(corpus)
    # A bare (non-list) template is wrapped so the encoder always gets a list.
    queries = template if type(template) == list else [template]
    query_vectors = model.encode(queries)
    similarity = cos_sim(corpus_vectors, query_vectors)
    # NOTE(review): np.argmax is called without an axis, so with more than one
    # template the index presumably refers to the flattened score matrix rather
    # than to a corpus position - confirm before passing a multi-item list.
    best_index = np.argmax(similarity)
    best_score = max(similarity)
    return best_index, best_score

def get_majority_author_sentence(paras_text):
    """Find the sentence that announces who wrote the opinion of the court.

    Paragraphs are scanned in order. Within each paragraph the sentences are
    first checked for an authored-opinion announcement (contains "justice",
    "opinion", "court" and either "delivered" or "announced"); if none matches,
    the same sentences are checked for a "per curiam" marker. All matching is
    done on the lower-cased sentence.

    Args:
        paras_text: List of paragraph strings.

    Returns:
        ``[sentence, paragraph_index, 0]`` for the first match, where
        ``sentence`` is the lower-cased sentence text.

    Raises:
        Exception: If no paragraph contains a matching sentence.
    """
    def names_author(sentence):
        has_verb = "delivered" in sentence or "announced" in sentence
        return has_verb and all(word in sentence for word in ("justice", "opinion", "court"))

    def is_per_curiam(sentence):
        return "per" in sentence and "curiam" in sentence

    for para_ind, para in enumerate(paras_text):
        lowered = [sentence.lower() for sentence in sent_tokenize(para)]
        final_pos = len(lowered) - 1
        # The authored-opinion pattern takes priority over per curiam within a paragraph.
        for matches in (names_author, is_per_curiam):
            for pos, sentence in enumerate(lowered):
                if not matches(sentence):
                    continue
                # Diagnostic only: the match is neither the first nor the last sentence.
                if pos not in (0, final_pos):
                    print("Located, but not within first or last paragraph")
                return [sentence, para_ind, 0]
    raise Exception("Could Not Locate Authoring Justice Sentence/Paragraph")

def get_other_justice_sentences(paras_text, ind_maj):
    """Collect concurrence, dissent and recusal sentences from *ind_maj* onwards.

    Sentences are lower-cased before matching. A sentence mentioning "justice"
    is classified as a concurrence or dissent header when it contains
    "concurring" / "dissenting" together with a comma; a later "join" sentence
    is filed under whichever of the two was seen most recently. Any sentence
    containing "took no part" is recorded as a recusal.

    Args:
        paras_text: List of paragraph strings.
        ind_maj: Paragraph index to start from; earlier paragraphs are skipped.

    Returns:
        A dict with keys ``'Concurrences'``, ``'Dissents'`` and ``'Recused'``,
        each a list of ``(sentence, paragraph_index, counter)`` tuples, where
        ``counter`` increases with every recognised sentence.
    """
    data = {'Concurrences': [], 'Dissents': [], 'Recused': []}
    counter = 0
    current_bucket = None  # key of the most recent concurrence/dissent header, if any

    for para_ind, para in enumerate(paras_text):
        if para_ind < ind_maj:
            continue
        for sentence in map(str.lower, sent_tokenize(para)):
            if "justice" in sentence:
                has_comma = "," in sentence
                if has_comma and "concurring" in sentence:
                    current_bucket = 'Concurrences'
                    counter += 1
                    data[current_bucket].append((sentence, para_ind, counter))
                elif has_comma and "dissenting" in sentence:
                    current_bucket = 'Dissents'
                    counter += 1
                    data[current_bucket].append((sentence, para_ind, counter))
                elif "join" in sentence:
                    # The counter advances even when there is no header yet to attach to.
                    counter += 1
                    if current_bucket is not None:
                        data[current_bucket].append((sentence, para_ind, counter))
            # Recusals are checked independently of the "justice" keyword.
            if "took no part" in sentence:
                counter += 1
                data['Recused'].append((sentence, para_ind, counter))
    return data

def split(paras_text, maj, other_data):
    """Cut the paragraph list into one text span per tagged opinion.

    The majority entry and every concurrence, dissent and recusal entry are
    ordered by their counter (fourth field). Each opinion's text runs from its
    own paragraph index up to, but not including, the paragraph index of the
    next opinion in that order; the last opinion runs to the end of
    *paras_text*.

    Args:
        paras_text: List of paragraph strings.
        maj: ``[sentence, paragraph_index, counter]`` for the majority opinion.
        other_data: Dict with ``'Concurrences'``, ``'Dissents'`` and
            ``'Recused'`` lists of ``(sentence, paragraph_index, counter)``.

    Returns:
        A DataFrame with columns ``Type``, ``Author Sent``, ``Start Para Ind``,
        ``End Para Ind`` and ``Text`` (paragraphs joined by ``"<PARA>"``), one
        row per opinion in counter order.
    """
    opinions = [('Majority', maj[0], maj[1], maj[2])]
    for label, key in (('Concurrence', 'Concurrences'),
                       ('Dissent', 'Dissents'),
                       ('Recused', 'Recused')):
        opinions.extend((label, entry[0], entry[1], entry[2]) for entry in other_data[key])

    # Sort the tuples directly (stable) instead of coercing everything into a
    # NumPy string array and parsing the integers back out afterwards.
    opinions.sort(key=lambda opinion: int(opinion[3]))

    starts = [int(opinion[2]) for opinion in opinions]
    # An opinion ends where the next one starts; the last one runs to the end.
    ends = starts[1:] + [len(paras_text)]

    opinions_data = [
        {
            'Type': opinion[0],
            'Author Sent': opinion[1],
            'Start Para Ind': start_ind,
            'End Para Ind': end_ind,
            'Text': "<PARA>".join(paras_text[start_ind:end_ind]),
        }
        for opinion, start_ind, end_ind in zip(opinions, starts, ends)
    ]
    return pd.DataFrame(data=opinions_data)

def draw_line_above_sent(folderpath, sent, para_ind, color=(0,0,0)):
    """Draw a horizontal line on the page image at the line best matching *sent*.

    The lines of paragraph *para_ind* are read from ``paragraphs.csv``, the one
    most similar to *sent* is chosen with ``semantic_match``, its bounding box
    is looked up in ``data.csv``, and a 2-pixel line is drawn on
    ``<page>-processed.png`` at the box's second coordinate, extended 10 pixels
    past the first and third coordinates. The image file is overwritten.

    Args:
        folderpath: Directory (as a string) holding the CSVs and page images.
        sent: Sentence text to locate.
        para_ind: Row index of the paragraph in ``paragraphs.csv``.
        color: Colour tuple passed to ``cv2.line``.
    """
    data_df = pd.read_csv(folderpath + '/data.csv')
    paras_df = pd.read_csv(folderpath + '/paragraphs.csv')
    # NOTE(review): eval() runs whatever the CSV cell contains - only use on trusted files.
    para_lines = eval(paras_df['Lines'].tolist()[para_ind])
    # Each record is unpacked as (page index, line index, <unused>, text).
    text_lines = [text for (_pg, _ln, _, text) in para_lines]

    ind, score = semantic_match(sent, text_lines)
    pg_ind, line_ind, _, text = para_lines[ind]

    page_rows = data_df[data_df['Pg Ind'] == pg_ind]
    page_lines = eval(page_rows['Lines'].tolist()[0])
    # Everything but the last element of the record is treated as the bounding box.
    line_bbox = page_lines[line_ind][0:-1]

    image_path = folderpath + '/' + str(pg_ind) + '-processed.png'
    image = cv2.imread(image_path)
    start_point = (line_bbox[0] - 10, line_bbox[1])
    end_point = (line_bbox[2] + 10, line_bbox[1])
    image = cv2.line(image, start_point, end_point, color=color, thickness=2)
    cv2.imwrite(image_path, image)


def process_file(folderpath, draw=False):
    """Tag the opinions of one document folder and write ``opinions.csv``.

    Loads the paragraph text, locates the majority author sentence and the
    concurrence/dissent/recusal sentences after it, splits the text into
    opinions and saves the resulting table to ``<folderpath>/opinions.csv``.

    Args:
        folderpath: Directory (as a string) containing the document's files.
        draw: When true, also mark the majority, concurrence and dissent
            sentences on the page images via ``draw_line_above_sent``.
    """
    paragraphs = get_paragraphed_text(folderpath)
    maj = get_majority_author_sentence(paragraphs)
    other_data = get_other_justice_sentences(paragraphs, maj[1])
    split(paragraphs, maj, other_data).to_csv(folderpath + '/opinions.csv', index=False)

    if not draw:
        return

    # Majority uses the default colour; the other opinion types get their own.
    draw_line_above_sent(folderpath, maj[0], maj[1])
    marker_colours = (('Concurrences', (0,100,0)), ('Dissents', (0,0,100)))
    for key, colour in marker_colours:
        for entry in other_data[key]:
            draw_line_above_sent(folderpath, entry[0], entry[1], color=colour)