Delete Tagger.py
Browse files
Tagger.py
DELETED
|
@@ -1,142 +0,0 @@
|
|
| 1 |
-
# This file tags the major text
|
| 2 |
-
import pandas as pd
|
| 3 |
-
import numpy as np
|
| 4 |
-
import re
|
| 5 |
-
from sentence_transformers import SentenceTransformer
|
| 6 |
-
from sentence_transformers.util import cos_sim
|
| 7 |
-
import cv2
|
| 8 |
-
import nltk
|
| 9 |
-
nltk.download('punkt')
|
| 10 |
-
from nltk.tokenize import sent_tokenize, word_tokenize
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
# Sentence-embedding model shared by semantic_match / draw_line_above_sent.
model = SentenceTransformer('all-mpnet-base-v2')
|
| 14 |
-
|
| 15 |
-
def get_paragraphed_text(folderpath):
    """Load ``folderpath``/paragraphs.csv and return one text string per paragraph.

    Each row of the 'Lines' column holds the repr of a list of line records;
    the last element of each record is the line's text.  Line texts are joined
    with single spaces and runs of whitespace are collapsed.
    """
    from ast import literal_eval  # safer than eval() for parsing stored list reprs

    paras_df = pd.read_csv(folderpath + '/paragraphs.csv')
    paras_lines = [literal_eval(p) for p in paras_df['Lines'].tolist()]
    paras_text = []
    for para in paras_lines:
        line_texts = [l[-1].strip() for l in para]
        # split()/join collapses any run of whitespace left by joining OCR lines
        # (the old .replace(' ', ' ') was a no-op).
        paras_text.append(" ".join(" ".join(line_texts).split()))
    return paras_text
|
| 23 |
-
|
| 24 |
-
def semantic_match(template, corpus):
    """Return (index, score) of the best semantic match for ``template`` in ``corpus``.

    ``template`` may be a single string or a list of strings; the index is the
    flat argmax over the corpus-by-template cosine-similarity matrix.
    """
    embs = model.encode(corpus)
    # isinstance also accepts list subclasses, unlike the old type() == list check.
    if isinstance(template, list):
        template_emb = model.encode(template)
    else:
        template_emb = model.encode([template])
    scores = np.asarray(cos_sim(embs, template_emb))
    # Take the true maximum similarity; the old max(scores) iterated over tensor
    # rows and raised "ambiguous truth value" for multi-template inputs.
    return np.argmax(scores), scores.max()
|
| 32 |
-
|
| 33 |
-
def get_majority_author_sentence(paras_text):
    """Locate the sentence announcing the author of the majority opinion.

    Scans each paragraph's sentences first for the "Justice ... delivered/
    announced the opinion of the Court" pattern, then for a per-curiam marker.
    Returns ``[lowercased sentence, paragraph index, 0]``.

    Raises:
        Exception: if no paragraph contains either pattern.
    """
    def _is_delivery(s):
        # "Justice X delivered/announced the opinion of the Court"
        return ("justice" in s and "opinion" in s and "court" in s
                and ("deliver" in s or "announc" in s))

    def _is_per_curiam(s):
        return "per" in s and "curiam" in s

    for (i, pt) in enumerate(paras_text):
        sents = sent_tokenize(pt)
        # Same order as the original duplicated loops: delivery pattern first,
        # then per curiam, within each paragraph.
        for pred in (_is_delivery, _is_per_curiam):
            for (j, s) in enumerate(sents):
                s = s.lower()
                if pred(s):
                    if j != 0 and j != len(sents) - 1:
                        print("Located, but not within first or last paragraph")
                    return [s, i, 0]
    raise Exception("Could Not Locate Authoring Justice Sentence/Paragraph")
|
| 50 |
-
|
| 51 |
-
def get_other_justices_sentences(paras_text, ind_maj):
    """Collect sentences that open concurrences, dissents, and recusal notes.

    Only paragraphs at or after ``ind_maj`` (the majority opinion's paragraph)
    are scanned.  Returns a dict with 'Concurrences', 'Dissents' and 'Recused'
    lists of ``(lowercased sentence, paragraph index, counter)`` tuples; the
    counter records discovery order across all categories.
    """
    # Raw strings fix the invalid '\s' escape warnings of the old patterns, and
    # [A-Za-z] replaces the buggy [A-z] class (which also matched [ \ ] ^ _ `).
    concur_re = re.compile(r',\s?concurring')
    concur_paren_re = re.compile(r'\([A-Za-z,\s]*concurring[A-Za-z,\s]*\)')
    dissent_re = re.compile(r',\s?dissenting')
    dissent_paren_re = re.compile(r'\([A-Za-z,\s]*dissenting[A-Za-z,\s]*\)')

    data = {'Concurrences': [], 'Dissents': [], 'Recused': []}
    counter = 0
    last = None
    for (i, pt) in enumerate(paras_text):
        if i < ind_maj:
            continue
        for s in sent_tokenize(pt):
            s = s.lower()
            if "justice" in s:
                # A parenthesised '(... concurring ...)' is a citation, not an
                # opinion header, so it is excluded.
                if concur_re.search(s) is not None and concur_paren_re.search(s) is None:
                    counter += 1
                    last = "C"
                    data['Concurrences'].append((s, i, counter))
                elif (dissent_re.search(s) or "dissent" in s[-9:].strip()) and dissent_paren_re.search(s) is None:
                    counter += 1
                    data['Dissents'].append((s, i, counter))
                    last = "D"
                elif "join" in s and s.index('join') > s.index('justice') and len(s.split(' ')) < 15:
                    # A short "Justice X joins" sentence extends whichever
                    # opinion type was seen last.
                    counter += 1
                    if last == "C":
                        data['Concurrences'].append((s, i, counter))
                    if last == "D":
                        data['Dissents'].append((s, i, counter))
            if "took no part" in s:  # This may not be triggered as often?
                counter += 1
                data['Recused'].append((s, i, counter))
    return data
|
| 80 |
-
|
| 81 |
-
|
| 82 |
-
def split(paras_text, maj, other_data):
    """Build a DataFrame of opinions with their paragraph spans and text.

    ``maj`` is ``[sentence, paragraph index, order]`` for the majority opinion;
    ``other_data`` holds the concurrence/dissent/recusal tuples from
    ``get_other_justices_sentences``.  Each opinion's text runs from its own
    paragraph up to the next opinion's paragraph; the last opinion runs to the
    end of the document.
    """
    opinions = [('Majority', maj[0], maj[1], maj[2])]
    for label, entries in (('Concurrence', other_data['Concurrences']),
                           ('Dissent', other_data['Dissents']),
                           ('Recused', other_data['Recused'])):
        for sent, para_ind, order in entries:
            opinions.append((label, sent, para_ind, order))

    # Sort by discovery order.  This replaces the old numpy round-trip, which
    # coerced every field (including the paragraph indices) to strings.
    opinions.sort(key=lambda o: int(o[3]))

    opinions_data = []
    for (i, opinion) in enumerate(opinions):
        start_ind = int(opinion[2])
        # Each opinion ends where the next one begins; the last runs to the end.
        end_ind = len(paras_text) if i == len(opinions) - 1 else int(opinions[i + 1][2])
        if end_ind == start_ind:
            end_ind += 1  # guarantee at least one paragraph of text
        opinions_data.append({
            'Type': opinion[0],
            'Author Sent': opinion[1],
            'Start Para Ind': start_ind,
            'End Para Ind': end_ind,
            'Text': "<PARA>".join(paras_text[start_ind:end_ind]),
        })
    return pd.DataFrame(data=opinions_data)
|
| 110 |
-
|
| 111 |
-
def draw_line_above_sent(folderpath, sent, para_ind, color=(0,0,0)):
    """Draw a line over the page image at the OCR line best matching ``sent``.

    Loads the line records of paragraph ``para_ind``, semantically matches
    ``sent`` against their texts, then draws a 2px line along the top edge of
    the matched line's bounding box on ``<page>-processed.png`` (BGR ``color``),
    overwriting the image in place.
    """
    from ast import literal_eval  # safer than eval() for the stored list reprs

    data_df = pd.read_csv(folderpath + '/data.csv')
    paras_df = pd.read_csv(folderpath + '/paragraphs.csv')
    para_lines = literal_eval(paras_df['Lines'].tolist()[para_ind])

    # Last element of each line record is its text.
    text_lines = [l[-1] for l in para_lines]

    ind, score = semantic_match(sent, text_lines)
    pg_ind, line_ind, _, text = para_lines[ind]
    line_data = literal_eval(data_df[data_df['Pg Ind'] == pg_ind]['Lines'].tolist()[0])[line_ind]
    line_bbox = line_data[0:-1]  # drop the trailing text field, keep the bbox

    img_path = folderpath + '/' + str(pg_ind) + '-processed.png'
    image = cv2.imread(img_path)
    image = cv2.line(image, (line_bbox[0] - 10, line_bbox[1]),
                     (line_bbox[2] + 10, line_bbox[1]), color=color, thickness=2)
    cv2.imwrite(img_path, image)
|
| 128 |
-
|
| 129 |
-
|
| 130 |
-
def process_file(folderpath, draw=False):
    """Tag one case folder: split its paragraphs into opinions and save them.

    Writes ``folderpath``/opinions.csv.  When ``draw`` is true, also annotates
    the processed page images: black line for the majority author sentence,
    green for concurrences, red for dissents.
    """
    paragraphs = get_paragraphed_text(folderpath)
    majority = get_majority_author_sentence(paragraphs)
    others = get_other_justices_sentences(paragraphs, majority[1])
    split(paragraphs, majority, others).to_csv(folderpath + '/opinions.csv', index=False)

    if not draw:
        return
    draw_line_above_sent(folderpath, majority[0], majority[1])
    for category, line_color in (('Concurrences', (0,100,0)), ('Dissents', (0,0,100))):
        for sentence, para_ind, _ in others[category]:
            draw_line_above_sent(folderpath, sentence, para_ind, color=line_color)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|