Gideon / Tagger.py
cools's picture
Update Tagger.py
2c92324
raw
history blame
5.65 kB
# This file tags the major text
import pandas as pd
import numpy as np
import re
from sentence_transformers import SentenceTransformer
from sentence_transformers.util import cos_sim
import cv2
import nltk
nltk.download('punkt')
from nltk.tokenize import sent_tokenize, word_tokenize
# Sentence-embedding model, loaded once at import time and shared by
# semantic_match() for every encode call.
model = SentenceTransformer('all-mpnet-base-v2')
def get_paragraphed_text(folderpath):
    """Load ``paragraphs.csv`` from *folderpath* and return one string per paragraph.

    Every cell of the ``Lines`` column is evaluated as a Python expression
    that yields a sequence of line records; the last element of each record
    is taken as that line's text.

    Args:
        folderpath: Directory that contains ``paragraphs.csv``.

    Returns:
        A list of strings, one per CSV row, each built by stripping the line
        texts, joining them with single spaces and collapsing double spaces.
    """
    paras_df = pd.read_csv(folderpath + '/paragraphs.csv')
    # NOTE(review): eval() executes whatever is stored in the CSV cell. Only
    # run this on files produced by a trusted pipeline.
    paras_lines = [eval(cell) for cell in paras_df['Lines'].tolist()]
    paras_text = []
    for para in paras_lines:
        line_texts = [line[-1].strip() for line in para]
        joined = " ".join(line_texts).strip()
        # Empty lines leave two adjacent spaces after the join; fold each
        # double space into one. This is a single pass, so runs of three or
        # more spaces are shortened but not fully collapsed.
        paras_text.append(joined.replace("  ", " "))
    return paras_text
def semantic_match(template, corpus):
    """Find the entry of *corpus* that is semantically closest to *template*.

    Args:
        template: A single string, or a list of strings. With a list, an
            entry's score is its best similarity against any of the
            templates.
        corpus: List of strings to search.

    Returns:
        A ``(index, score)`` pair: the position in *corpus* of the best
        match, and the similarity of that match as produced by ``max()``
        over the per-entry scores (a one-element tensor, assuming
        ``cos_sim`` returns a torch tensor -- TODO confirm).
    """
    embs = model.encode(corpus)
    templates = template if isinstance(template, list) else [template]
    template_emb = model.encode(templates)
    # One row per corpus entry, one column per template.
    scores = cos_sim(embs, template_emb)
    # Reduce over the template axis first. Taking argmax of the raw matrix
    # would yield an index into the flattened corpus x template grid, and
    # max() over multi-element rows raises. keepdim preserves the (n, 1)
    # shape, so the single-template result is unchanged.
    best_per_entry = scores.max(dim=1, keepdim=True).values
    return np.argmax(best_per_entry), max(best_per_entry)
def get_majority_author_sentence(paras_text):
    """Locate the sentence that announces who wrote the majority opinion.

    Paragraphs are scanned in order. Within each paragraph the sentences are
    lower-cased and checked first for an authored-opinion announcement
    ("justice", "opinion", "court" plus "delivered" or "announced"), then
    for a per curiam marker ("per" and "curiam").

    Args:
        paras_text: List of paragraph strings.

    Returns:
        ``[sentence, paragraph_index, 0]`` for the first match, where
        ``sentence`` is the lower-cased sentence.

    Raises:
        Exception: If no paragraph contains a matching sentence.
    """
    for para_idx, paragraph in enumerate(paras_text):
        sentences = [sent.lower() for sent in sent_tokenize(paragraph)]
        edge_positions = (0, len(sentences) - 1)

        # Pass 1: a named justice delivering/announcing the Court's opinion.
        for pos, sentence in enumerate(sentences):
            names_opinion = (
                "justice" in sentence
                and "opinion" in sentence
                and "court" in sentence
            )
            has_verb = "delivered" in sentence or "announced" in sentence
            if not (names_opinion and has_verb):
                continue
            if pos not in edge_positions:
                print("Located, but not within first or last paragraph")
            return [sentence, para_idx, 0]

        # Pass 2: unsigned (per curiam) opinions.
        for pos, sentence in enumerate(sentences):
            if not ("per" in sentence and "curiam" in sentence):
                continue
            if pos not in edge_positions:
                print("Located, but not within first or last paragraph")
            return [sentence, para_idx, 0]

    raise Exception("Could Not Locate Authoring Justice Sentence/Paragraph")
def get_other_justice_sentences(paras_text, ind_maj):
    """Collect the sentences that introduce concurrences, dissents and recusals.

    Paragraphs before index *ind_maj* are skipped. Every remaining sentence
    is lower-cased, and only sentences mentioning "justice" are considered.
    A running counter records the order in which matches were found.

    Args:
        paras_text: List of paragraph strings.
        ind_maj: Paragraph index of the majority-author sentence; scanning
            starts at this paragraph.

    Returns:
        A dict with keys ``'Concurrences'``, ``'Dissents'`` and
        ``'Recused'``, each holding a list of
        ``(sentence, paragraph_index, counter)`` tuples.
    """
    concurrences, dissents, recused = [], [], []
    by_marker = {"C": concurrences, "D": dissents}
    seq = 0
    current = None  # "C"/"D": kind of the most recent separate opinion seen

    for para_idx, paragraph in enumerate(paras_text):
        if para_idx < ind_maj:
            continue
        for raw_sentence in sent_tokenize(paragraph):
            sentence = raw_sentence.lower()
            if "justice" not in sentence:
                continue

            has_comma = "," in sentence
            if "concurring" in sentence and has_comma:
                seq += 1
                current = "C"
                concurrences.append((sentence, para_idx, seq))
            elif "dissenting" in sentence and has_comma:
                seq += 1
                current = "D"
                dissents.append((sentence, para_idx, seq))
            elif "join" in sentence:
                # A joinder is filed under whichever opinion kind came last;
                # the counter advances even when none has been seen yet.
                seq += 1
                if current in by_marker:
                    by_marker[current].append((sentence, para_idx, seq))

            # Checked independently of the chain above, so one sentence can
            # be recorded both as an opinion header and as a recusal.
            if "took no part" in sentence:
                seq += 1
                recused.append((sentence, para_idx, seq))

    return {'Concurrences': concurrences, 'Dissents': dissents, 'Recused': recused}
def split(paras_text, maj, other_data):
    """Cut the paragraph list into one text span per opinion.

    Args:
        paras_text: List of paragraph strings.
        maj: ``[sentence, paragraph_index, order]`` for the majority opinion.
        other_data: Dict with ``'Concurrences'``, ``'Dissents'`` and
            ``'Recused'`` lists of ``(sentence, paragraph_index, order)``.

    Returns:
        A DataFrame with columns ``Type``, ``Author Sent``,
        ``Start Para Ind``, ``End Para Ind`` and ``Text``, one row per
        opinion, sorted by the order value. Each span runs from its own
        start paragraph up to the start paragraph of the next opinion (or
        the end of the document for the last one); paragraphs are joined
        with ``"<PARA>"``.
    """
    labelled = [('Majority', maj[0], maj[1], maj[2])]
    for label, key in (
        ('Concurrence', 'Concurrences'),
        ('Dissent', 'Dissents'),
        ('Recused', 'Recused'),
    ):
        labelled.extend(
            (label, entry[0], entry[1], entry[2]) for entry in other_data[key]
        )

    # Mixed str/int tuples become a 2-D string array; the numeric columns
    # are converted back with int() where needed.
    table = np.array(labelled)
    table = table[table[:, 3].astype(int).argsort()]

    starts = [int(row[2]) for row in table]
    # Each opinion ends where the next begins; the last runs to the end.
    ends = starts[1:] + [len(paras_text)]

    rows = [
        {
            'Type': row[0],
            'Author Sent': row[1],
            'Start Para Ind': start,
            'End Para Ind': end,
            'Text': "<PARA>".join(paras_text[start:end]),
        }
        for row, start, end in zip(table, starts, ends)
    ]
    return pd.DataFrame(data=rows)
def draw_line_above_sent(folderpath, sent, para_ind, color=(0,0,0)):
    """Mark on the page image the line of a paragraph that best matches *sent*.

    The paragraph's line records are read from ``paragraphs.csv``, the line
    whose text is semantically closest to *sent* is chosen, its geometry is
    looked up in ``data.csv``, and a 2-pixel horizontal line is drawn on
    ``<pg_ind>-processed.png``, which is then overwritten in place.

    Args:
        folderpath: Directory holding ``data.csv``, ``paragraphs.csv`` and
            the processed page images.
        sent: Sentence to locate.
        para_ind: Row index of the paragraph in ``paragraphs.csv``.
        color: Colour triple passed straight to ``cv2.line``.
    """
    pages_df = pd.read_csv(folderpath + '/data.csv')
    paragraphs_df = pd.read_csv(folderpath + '/paragraphs.csv')

    # NOTE(review): eval() runs the CSV cell as Python; trusted input only.
    records = eval(paragraphs_df['Lines'].tolist()[para_ind])
    # Each record unpacks to four fields; the last one is the line text.
    line_texts = [text for (_pg, _ln, _unused, text) in records]

    best, _score = semantic_match(sent, line_texts)
    pg_ind, line_ind, _, _text = records[best]

    page_rows = pages_df[pages_df['Pg Ind'] == pg_ind]
    line_data = eval(page_rows['Lines'].tolist()[0])[line_ind]
    # Everything but the last field is treated as the bounding box;
    # presumably (x0, y0, x1, ...) -- TODO confirm against the producer.
    bbox = line_data[0:-1]

    image_path = folderpath + '/' + str(pg_ind) + '-processed.png'
    left_end = (bbox[0] - 10, bbox[1])
    right_end = (bbox[2] + 10, bbox[1])
    image = cv2.imread(image_path)
    image = cv2.line(image, left_end, right_end, color=color, thickness=2)
    cv2.imwrite(image_path, image)
def process_file(folderpath, draw=False):
    """Tag the opinions of one document folder and write ``opinions.csv``.

    Reads the paragraph text, finds the majority-author sentence and the
    other justices' sentences, splits the text into opinions and saves the
    result to ``<folderpath>/opinions.csv``.

    Args:
        folderpath: Directory containing the document's CSV files (and the
            processed page images when *draw* is true).
        draw: When true, also draw a marker line on the page images for the
            majority sentence (default colour), each concurrence
            ``(0,100,0)`` and each dissent ``(0,0,100)``.
    """
    paragraphs = get_paragraphed_text(folderpath)
    majority = get_majority_author_sentence(paragraphs)
    others = get_other_justice_sentences(paragraphs, majority[1])

    split(paragraphs, majority, others).to_csv(folderpath + '/opinions.csv', index=False)

    if not draw:
        return

    draw_line_above_sent(folderpath, majority[0], majority[1])
    for key, shade in (('Concurrences', (0,100,0)), ('Dissents', (0,0,100))):
        for entry in others[key]:
            draw_line_above_sent(folderpath, entry[0], entry[1], color=shade)