"""Tag the opinions contained in a paragraphed court-opinion text.

The module locates the sentence that names the author of the majority
opinion (or a per curiam marker), the sentences that introduce concurrences,
dissents and recusals, splits the paragraph list into one text segment per
tagged sentence, and can optionally draw a marker line above each tagged
sentence on the processed page images.
"""
import re

import cv2
import nltk
import numpy as np
import pandas as pd
from nltk.tokenize import sent_tokenize, word_tokenize
from sentence_transformers import SentenceTransformer
from sentence_transformers.util import cos_sim

# Punkt data is downloaded at import time; sent_tokenize is its consumer here.
nltk.download('punkt')

# Sentence-embedding model shared by every semantic_match call.
model = SentenceTransformer('all-mpnet-base-v2')


def get_paragraphed_text(folderpath):
    """Return the text of every paragraph listed in ``paragraphs.csv``.

    Each cell of the ``Lines`` column is evaluated into a sequence of line
    records whose last element is the line text. The stripped line texts of
    one paragraph are joined with single spaces and any run of whitespace is
    collapsed to one space.

    Args:
        folderpath: Folder that contains ``paragraphs.csv``.

    Returns:
        A list with one string per row of ``paragraphs.csv``.
    """
    paras_df = pd.read_csv(f"{folderpath}/paragraphs.csv")
    paras_text = []
    for cell in paras_df['Lines'].tolist():
        # NOTE(security): eval executes whatever the CSV cell contains; only
        # run this on files produced by a trusted pipeline.
        para = eval(cell)
        joined = " ".join(line[-1].strip() for line in para)
        # The original replace() call was a no-op as written (the character
        # it targeted was lost); normalise all whitespace runs instead.
        paras_text.append(" ".join(joined.split()))
    return paras_text


def semantic_match(template, corpus):
    """Find the corpus entry most similar to ``template``.

    Args:
        template: A single string, or a list of strings. With a list, an
            entry's score is its best similarity against any template.
        corpus: Non-empty sequence of strings to search.

    Returns:
        ``(index, score)`` where ``index`` is the position in ``corpus`` of
        the best match and ``score`` is a one-element tensor holding its
        cosine similarity.

    Raises:
        ValueError: If ``corpus`` is empty.
    """
    if len(corpus) == 0:
        raise ValueError("corpus must contain at least one entry")
    templates = template if isinstance(template, list) else [template]
    corpus_embs = model.encode(corpus)
    template_embs = model.encode(templates)
    # One row per corpus entry, one column per template.
    scores = cos_sim(corpus_embs, template_embs)
    # Reduce over the template axis first so the argmax below is always a
    # corpus index; keepdim keeps each score a one-element tensor.
    best_per_entry = scores.max(dim=1, keepdim=True).values
    best_ind = int(best_per_entry.argmax())
    return best_ind, best_per_entry[best_ind]


def _is_authorship_sentence(sentence):
    """Tell whether a lower-cased sentence announces the opinion's author."""
    return (
        "justice" in sentence
        and "opinion" in sentence
        and "court" in sentence
        and ("delivered" in sentence or "announced" in sentence)
    )


def _is_per_curiam_sentence(sentence):
    """Tell whether a lower-cased sentence contains a per curiam marker."""
    return "per" in sentence and "curiam" in sentence


def get_majority_author_sentence(paras_text):
    """Locate the sentence that opens the majority opinion.

    Paragraphs are scanned in order. Within a paragraph an authorship
    sentence is preferred; failing that, a per curiam sentence is accepted.
    A notice is printed when the hit is neither the first nor the last
    sentence of its paragraph.

    Args:
        paras_text: Paragraph strings, in document order.

    Returns:
        ``[sentence, paragraph_index, 0]`` with the sentence lower-cased; the
        trailing ``0`` is the ordering key that ``split`` sorts on.

    Raises:
        Exception: If no paragraph contains a matching sentence.
    """
    for para_ind, para in enumerate(paras_text):
        sents = [sent.lower() for sent in sent_tokenize(para)]
        last_ind = len(sents) - 1
        for is_match in (_is_authorship_sentence, _is_per_curiam_sentence):
            for sent_ind, sent in enumerate(sents):
                if not is_match(sent):
                    continue
                if sent_ind not in (0, last_ind):
                    print("Located, but not within first or last paragraph")
                return [sent, para_ind, 0]
    raise Exception("Could Not Locate Authoring Justice Sentence/Paragraph")


def get_other_justice_sentences(paras_text, ind_maj):
    """Collect the sentences that introduce non-majority opinions.

    Only paragraphs at index ``ind_maj`` or later are scanned. Every recorded
    sentence gets the next value of a running counter, so the counter gives
    the order in which the sentences appear.

    Args:
        paras_text: Paragraph strings, in document order.
        ind_maj: Paragraph index of the majority author sentence.

    Returns:
        A dict with the keys ``'Concurrences'``, ``'Dissents'`` and
        ``'Recused'``, each a list of
        ``(lower-cased sentence, paragraph_index, counter)`` tuples.
    """
    data = {'Concurrences': [], 'Dissents': [], 'Recused': []}
    bucket_for = {"C": 'Concurrences', "D": 'Dissents'}
    counter = 0
    last = None  # "C"/"D": kind of the most recent concurrence/dissent hit
    for para_ind, para in enumerate(paras_text):
        if para_ind < ind_maj:
            continue
        for raw_sent in sent_tokenize(para):
            sent = raw_sent.lower()
            if "justice" not in sent:
                continue
            if "concurring" in sent and "," in sent:
                counter += 1
                last = "C"
                data['Concurrences'].append((sent, para_ind, counter))
            elif "dissenting" in sent and "," in sent:
                counter += 1
                last = "D"
                data['Dissents'].append((sent, para_ind, counter))
            elif "join" in sent:
                # A "join" sentence is filed under the kind of the previous
                # hit; the counter advances even when there is none yet.
                counter += 1
                if last in bucket_for:
                    data[bucket_for[last]].append((sent, para_ind, counter))
            # NOTE(review): the nesting of this check was ambiguous in the
            # original layout; it is kept inside the "justice" branch.
            if "took no part" in sent:
                counter += 1
                data['Recused'].append((sent, para_ind, counter))
    return data


def split(paras_text, maj, other_data):
    """Split the paragraphs into one text segment per tagged sentence.

    Segments are ordered by the ordering key of their sentence. A segment
    starts at the paragraph of its sentence and ends where the paragraph of
    the next tagged sentence begins; the last one runs to the end.

    Args:
        paras_text: Paragraph strings, in document order.
        maj: ``[sentence, paragraph_index, order]`` of the majority opinion.
        other_data: Dict as returned by ``get_other_justice_sentences``.

    Returns:
        A DataFrame with the columns ``Type``, ``Author Sent``,
        ``Start Para Ind``, ``End Para Ind`` and ``Text``.
    """
    opinions = [('Majority', maj[0], maj[1], maj[2])]
    labelled_groups = (
        ('Concurrence', other_data['Concurrences']),
        ('Dissent', other_data['Dissents']),
        ('Recused', other_data['Recused']),
    )
    for label, entries in labelled_groups:
        opinions.extend((label, e[0], e[1], e[2]) for e in entries)
    opinions.sort(key=lambda opinion: int(opinion[3]))

    opinions_data = []
    for pos, (kind, sent, start, _) in enumerate(opinions):
        start_ind = int(start)
        if pos == len(opinions) - 1:
            end_ind = len(paras_text)
        else:
            # TODO(review): the original author questioned whether running
            # headers such as "Supreme Court of US" should be excluded from
            # the segment; they are not.
            end_ind = int(opinions[pos + 1][2])
        opinions_data.append({
            'Type': kind,
            'Author Sent': sent,
            'Start Para Ind': start_ind,
            'End Para Ind': end_ind,
            # NOTE(review): paragraphs are joined without a separator, so
            # the words on either side of a paragraph break run together.
            'Text': "".join(paras_text[start_ind:end_ind]),
        })
    return pd.DataFrame(data=opinions_data)


def draw_line_above_sent(folderpath, sent, para_ind, color=(0, 0, 0)):
    """Draw a horizontal line on the page image at the line matching ``sent``.

    The line of paragraph ``para_ind`` that is semantically closest to
    ``sent`` is looked up in ``data.csv`` and a 2-pixel line is drawn at the
    second coordinate of its box, extended by 10 pixels on each side. The
    page image ``<page index>-processed.png`` is overwritten in place.

    Args:
        folderpath: Folder with ``data.csv``, ``paragraphs.csv`` and the
            processed page images.
        sent: Sentence to locate.
        para_ind: Row index in ``paragraphs.csv`` of the paragraph to search.
        color: Line colour as a 3-tuple passed to ``cv2.line``.

    Raises:
        FileNotFoundError: If the page image cannot be read.
    """
    data_df = pd.read_csv(f"{folderpath}/data.csv")
    paras_df = pd.read_csv(f"{folderpath}/paragraphs.csv")
    # NOTE(security): eval executes the CSV cell contents; trusted input only.
    para_lines = eval(paras_df['Lines'].tolist()[para_ind])
    # Each record unpacks into four fields, the last one being the line text.
    text_lines = [text for _, _, _, text in para_lines]
    ind, _ = semantic_match(sent, text_lines)
    pg_ind, line_ind, _, _ = para_lines[ind]

    page_rows = data_df[data_df['Pg Ind'] == pg_ind]['Lines'].tolist()
    line_data = eval(page_rows[0])[line_ind]
    # Everything but the last element is treated as the box; presumably
    # (x0, y0, x1, ...) - TODO confirm against the producer of data.csv.
    line_bbox = line_data[0:-1]

    image_path = f"{folderpath}/{pg_ind}-processed.png"
    image = cv2.imread(image_path)
    if image is None:
        # imread reports failure by returning None instead of raising.
        raise FileNotFoundError(f"Could not read page image: {image_path}")
    y = int(line_bbox[1])
    image = cv2.line(
        image,
        (int(line_bbox[0]) - 10, y),
        (int(line_bbox[2]) + 10, y),
        color=color,
        thickness=2,
    )
    cv2.imwrite(image_path, image)


def process_file(folderpath, draw=False):
    """Tag the opinions of one document folder and write ``opinions.csv``.

    Args:
        folderpath: Folder with ``paragraphs.csv`` (plus ``data.csv`` and the
            processed page images when ``draw`` is true).
        draw: When true, also draw a line above the majority sentence and
            above every concurrence and dissent sentence, using a different
            colour for each of the three kinds.
    """
    paras_text = get_paragraphed_text(folderpath)
    maj = get_majority_author_sentence(paras_text)
    other_data = get_other_justice_sentences(paras_text, maj[1])
    opinions_df = split(paras_text, maj, other_data)
    opinions_df.to_csv(f"{folderpath}/opinions.csv", index=False)
    if not draw:
        return
    draw_line_above_sent(folderpath, maj[0], maj[1])
    for conc in other_data['Concurrences']:
        draw_line_above_sent(folderpath, conc[0], conc[1], color=(0, 100, 0))
    for diss in other_data['Dissents']:
        draw_line_above_sent(folderpath, diss[0], diss[1], color=(0, 0, 100))