| | |
| | import pandas as pd |
| | import numpy as np |
| | import re |
| | from sentence_transformers import SentenceTransformer |
| | from sentence_transformers.util import cos_sim |
| | import cv2 |
| | import nltk |
| | nltk.download('punkt') |
| | from nltk.tokenize import sent_tokenize, word_tokenize |
| |
|
| |
|
# Sentence-embedding model used by semantic_match(); it is constructed once,
# when this module is imported.
model = SentenceTransformer('all-mpnet-base-v2')
| |
|
def get_paragraphed_text(folderpath):
    """Load ``paragraphs.csv`` from a folder and return one string per paragraph.

    Every cell of the ``Lines`` column is evaluated as a Python expression
    yielding the paragraph's line entries; the last element of each entry is
    used as that line's text.

    Args:
        folderpath: Folder that contains ``paragraphs.csv`` (str or path-like).

    Returns:
        list[str]: For each CSV row, its stripped line texts joined with single
        spaces, with doubled spaces collapsed.
    """
    paras_df = pd.read_csv(f"{folderpath}/paragraphs.csv")
    # NOTE(review): eval() executes whatever expression the CSV cell holds, so
    # this is only safe for files produced by a trusted earlier step. If the
    # cells are plain literals, ast.literal_eval would be the safer choice --
    # confirm against the code that writes paragraphs.csv before switching.
    paras_lines = [eval(cell) for cell in paras_df['Lines'].tolist()]

    paras_text = []
    for para in paras_lines:
        joined = " ".join(line[-1].strip() for line in para).strip()
        # NOTE(review): the source as received replaced a single space with a
        # single space (a no-op); collapsing doubled spaces is the presumed
        # intent -- confirm.
        paras_text.append(joined.replace("  ", " "))
    return paras_text
| |
|
def semantic_match(template, corpus):
    """Find the corpus entry most similar to a template (or to any of several).

    Args:
        template: A single string, or a list of strings to match against.
        corpus: Sequence of strings to search.

    Returns:
        tuple: ``(index, score)`` where ``index`` is the position in ``corpus``
        of the best-matching entry and ``score`` is a one-element tensor with
        its cosine similarity to the closest template.
    """
    corpus_embs = model.encode(corpus)
    templates = template if isinstance(template, list) else [template]
    template_embs = model.encode(templates)

    # scores[i][j] is the similarity between corpus[i] and templates[j].
    scores = cos_sim(corpus_embs, template_embs)

    # Reduce over the template axis first so the winning index always addresses
    # ``corpus``. An argmax over the flattened matrix, and Python's max() over
    # its rows, are only meaningful when there is exactly one template.
    best_per_entry = scores.max(dim=1, keepdim=True).values
    best_ind = int(best_per_entry.argmax())
    return best_ind, best_per_entry[best_ind]
| |
|
def get_majority_author_sentence(paras_text):
    """Locate the sentence that announces who wrote the opinion of the Court.

    Paragraphs are scanned in order. Within each paragraph the lowercased
    sentences are checked first for an authoring-justice announcement
    ("justice", "opinion", "court" and "delivered" or "announced"), then for
    a "per curiam" marker.

    NOTE(review): this was reconstructed from a source that had lost its
    indentation; the per-curiam scan is taken to run once per paragraph --
    confirm.

    Args:
        paras_text: List of paragraph strings.

    Returns:
        list: ``[lowercased sentence, paragraph index, 0]`` for the first match.

    Raises:
        Exception: If no paragraph contains a matching sentence.
    """
    position_warning = "Located, but not within first or last paragraph"

    for para_ind, para in enumerate(paras_text):
        lowered = [sentence.lower() for sentence in sent_tokenize(para)]
        edge_positions = (0, len(lowered) - 1)

        # First choice: "Justice X delivered/announced the opinion of the Court".
        for pos, sentence in enumerate(lowered):
            mentions_opinion = (
                "justice" in sentence and "opinion" in sentence and "court" in sentence
            )
            if mentions_opinion and ("delivered" in sentence or "announced" in sentence):
                if pos not in edge_positions:
                    print(position_warning)
                return [sentence, para_ind, 0]

        # Fallback for the same paragraph: an unsigned "per curiam" marker.
        for pos, sentence in enumerate(lowered):
            if "per" in sentence and "curiam" in sentence:
                if pos not in edge_positions:
                    print(position_warning)
                return [sentence, para_ind, 0]

    raise Exception("Could Not Locate Authoring Justice Sentence/Paragraph")
| |
|
def get_other_justice_sentences(paras_text, ind_maj):
    """Collect concurrence, dissent and recusal sentences from ``ind_maj`` onward.

    Each recorded item is ``(lowercased sentence, paragraph index, sequence
    number)``; the sequence number is a running counter that reflects the
    order in which sentences were encountered.

    NOTE(review): this was reconstructed from a source that had lost its
    indentation; the "took no part" check is taken to sit inside the
    "justice" guard -- confirm.

    Args:
        paras_text: List of paragraph strings.
        ind_maj: Paragraphs with an index below this value are skipped.

    Returns:
        dict: Keys ``'Concurrences'``, ``'Dissents'`` and ``'Recused'``, each
        mapping to a list of the tuples described above.
    """
    concurrences, dissents, recused = [], [], []
    data = {'Concurrences': concurrences, 'Dissents': dissents, 'Recused': recused}
    # A "join" sentence is filed under whichever kind of opinion was seen last.
    bucket_for_last = {"C": concurrences, "D": dissents}
    last_kind = None
    seq = 0

    for para_ind, para in enumerate(paras_text):
        if para_ind >= ind_maj:
            for raw_sentence in sent_tokenize(para):
                sentence = raw_sentence.lower()
                if "justice" not in sentence:
                    continue

                has_comma = "," in sentence
                if "concurring" in sentence and has_comma:
                    seq += 1
                    last_kind = "C"
                    concurrences.append((sentence, para_ind, seq))
                elif "dissenting" in sentence and has_comma:
                    seq += 1
                    dissents.append((sentence, para_ind, seq))
                    last_kind = "D"
                elif "join" in sentence:
                    # The counter advances even when no opinion has been seen yet.
                    seq += 1
                    target = bucket_for_last.get(last_kind)
                    if target is not None:
                        target.append((sentence, para_ind, seq))

                if "took no part" in sentence:
                    seq += 1
                    recused.append((sentence, para_ind, seq))

    return data
| |
|
def split(paras_text, maj, other_data):
    """Cut the paragraph list into one text span per opinion marker.

    Args:
        paras_text: List of paragraph strings.
        maj: Majority marker as ``[sentence, paragraph index, order]``.
        other_data: Dict with ``'Concurrences'``, ``'Dissents'`` and
            ``'Recused'`` lists of ``(sentence, paragraph index, order)``.

    Returns:
        pandas.DataFrame: One row per marker, sorted by ``order``, with columns
        ``Type``, ``Author Sent``, ``Start Para Ind``, ``End Para Ind`` and
        ``Text``. Each span runs up to the next marker's start paragraph (or
        the end of the document) and is joined with ``<PARA>``.
    """
    labelled = [('Majority', maj[0], maj[1], maj[2])]
    for label, key in (
        ('Concurrence', 'Concurrences'),
        ('Dissent', 'Dissents'),
        ('Recused', 'Recused'),
    ):
        labelled.extend(
            (label, entry[0], entry[1], entry[2]) for entry in other_data[key]
        )

    # Mixed str/int tuples become a 2-D string array; the order column is cast
    # back to int for sorting.
    table = np.array(labelled)
    table = table[table[:, 3].astype(int).argsort()]

    starts = [int(row[2]) for row in table]
    # Each span ends where the next one starts; the last runs to the end.
    ends = starts[1:] + [len(paras_text)]

    records = []
    for row, start_ind, end_ind in zip(table, starts, ends):
        records.append({
            'Type': row[0],
            'Author Sent': row[1],
            'Start Para Ind': start_ind,
            'End Para Ind': end_ind,
            'Text': "<PARA>".join(paras_text[start_ind:end_ind]),
        })
    return pd.DataFrame(data=records)
| |
|
def draw_line_above_sent(folderpath, sent, para_ind, color=(0,0,0)):
    """Draw a horizontal line on the page image at the line best matching ``sent``.

    The paragraph's line texts are compared with ``sent`` via semantic_match();
    the winning line's page index and line index are looked up in ``data.csv``
    and a line is drawn across ``<page index>-processed.png``, which is then
    overwritten in place.

    Args:
        folderpath: Folder holding ``data.csv``, ``paragraphs.csv`` and the
            ``-processed.png`` page images (string; joined with ``+``).
        sent: Sentence to locate.
        para_ind: Row index into ``paragraphs.csv`` of the paragraph to search.
        color: Colour tuple passed through to ``cv2.line``.
    """
    data_df = pd.read_csv(folderpath + '/data.csv')
    paras_df = pd.read_csv(folderpath + '/paragraphs.csv')

    # NOTE(review): eval() runs whatever expression the CSV cell contains; this
    # is only safe for files produced by a trusted earlier step.
    lines_column = paras_df['Lines'].tolist()
    para_lines = eval(lines_column[para_ind])
    # Each line entry unpacks as (page index, line index, <unused>, text).
    text_lines = [text for (_pg, _ln, _, text) in para_lines]

    ind, score = semantic_match(sent, text_lines)
    pg_ind, line_ind, _, text = para_lines[ind]

    page_rows = data_df.loc[data_df['Pg Ind'] == pg_ind, 'Lines'].tolist()
    line_data = eval(page_rows[0])[line_ind]
    # Everything except the final element is used as the line's bounding box.
    line_bbox = line_data[0:-1]
    x_start, y_pos, x_end = line_bbox[0], line_bbox[1], line_bbox[2]

    image_path = folderpath + '/' + str(pg_ind) + '-processed.png'
    image = cv2.imread(image_path)
    # The stroke sits at the box's second coordinate and overshoots the first
    # and third coordinates by 10 on each side.
    image = cv2.line(
        image,
        (x_start - 10, y_pos),
        (x_end + 10, y_pos),
        color=color,
        thickness=2,
    )
    cv2.imwrite(image_path, image)
| |
|
| |
|
def process_file(folderpath, draw=False):
    """Split one document folder into opinions and write ``opinions.csv``.

    Args:
        folderpath: Folder to read from and write to (string; joined with ``+``).
        draw: When true, also mark the majority, concurrence and dissent
            sentences on the page images via draw_line_above_sent().
    """
    paras_text = get_paragraphed_text(folderpath)
    maj = get_majority_author_sentence(paras_text)
    other_data = get_other_justice_sentences(paras_text, maj[1])

    opinions_df = split(paras_text, maj, other_data)
    opinions_df.to_csv(folderpath + '/opinions.csv', index=False)

    if not draw:
        return

    # The majority marker uses the default colour; each other kind has its own.
    draw_line_above_sent(folderpath, maj[0], maj[1])
    for key, line_color in (('Concurrences', (0,100,0)), ('Dissents', (0,0,100))):
        for entry in other_data[key]:
            draw_line_above_sent(folderpath, entry[0], entry[1], color=line_color)
| |
|
| |
|
| |
|