Upload Tagger
Browse files
Tags (and visualizes) various aspects of the opinions
Tagger.py
ADDED
|
@@ -0,0 +1,73 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
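# Tags the main text of an opinion: finds the sentence naming the authoring justice (or "per curiam") and can draw a rule above it on the page image.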
|
| 2 |
+
import pandas as pd
|
| 3 |
+
import numpy as np
|
| 4 |
+
import re
|
| 5 |
+
from sentence_transformers import SentenceTransformer
|
| 6 |
+
from sentence_transformers.util import cos_sim
|
| 7 |
+
import cv2
|
| 8 |
+
|
| 9 |
+
def get_paragraphed_text(folderpath):
    """Return the text of every paragraph listed in ``paragraphs.csv``.

    Reads ``<folderpath>/paragraphs.csv`` and evaluates each cell of its
    ``Lines`` column into a sequence of line records. The last element of
    each record is used as that line's text.

    Args:
        folderpath: Directory that contains ``paragraphs.csv``.

    Returns:
        A list with one string per paragraph: the stripped line texts joined
        by single spaces, with any run of spaces collapsed to one.
    """
    paras_df = pd.read_csv(folderpath + '/paragraphs.csv')
    # NOTE(review): eval() executes whatever expression the CSV cell holds.
    # If the column only ever contains plain literals, ast.literal_eval is
    # the safe replacement -- confirm against whatever writes this file.
    paras_lines = [eval(cell) for cell in paras_df['Lines'].tolist()]

    paras_text = []
    for para in paras_lines:
        joined = " ".join(line[-1].strip() for line in para).strip()
        # Blank lines leave doubled spaces after the join; dropping the empty
        # pieces guarantees words are separated by exactly one space.
        paras_text.append(" ".join(piece for piece in joined.split(" ") if piece))
    return paras_text
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
# Loaded SentenceTransformer instances keyed by model name, so repeated calls
# to semantic_match do not reload the model from disk each time.
_SENTENCE_MODELS = {}


def _get_sentence_model(name):
    """Return the SentenceTransformer called *name*, loading it only once."""
    if name not in _SENTENCE_MODELS:
        _SENTENCE_MODELS[name] = SentenceTransformer(name)
    return _SENTENCE_MODELS[name]


def semantic_match(template, corpus, model_name='paraphrase-MiniLM-L6-v2'):
    """Find the corpus entry most similar to a template.

    Args:
        template: A single template string, or a list of template strings.
            With a list, an entry's score is its best cosine similarity over
            all templates.
        corpus: Sequence of candidate strings to rank.
        model_name: Name of the sentence-transformers model used to embed
            both the corpus and the template(s).

    Returns:
        A ``(index, score)`` tuple: the position in ``corpus`` of the best
        match (``int``) and its cosine similarity (``float``).

    Raises:
        ValueError: If ``corpus`` or a template list is empty.
    """
    corpus = list(corpus)
    if not corpus:
        raise ValueError("corpus must contain at least one text")
    templates = template if isinstance(template, list) else [template]
    if not templates:
        raise ValueError("template list must contain at least one text")

    model = _get_sentence_model(model_name)
    corpus_embs = model.encode(corpus)
    template_embs = model.encode(templates)

    # cos_sim yields a (len(corpus), len(templates)) matrix. Reduce over the
    # template axis first so the argmax is always a valid corpus index, even
    # when several templates are supplied.
    scores = np.asarray(cos_sim(corpus_embs, template_embs))
    best_per_text = scores.max(axis=1)
    best_ind = int(best_per_text.argmax())
    return best_ind, float(best_per_text[best_ind])
|
| 28 |
+
|
| 29 |
+
def get_majority_author_sentence(paras_text, threshold=0.6):
    """Locate the sentence that names the author of the majority opinion.

    Each paragraph is split on ``.`` and ``:`` into lower-cased fragments.
    The fragments are matched first against the "delivered the opinion of
    the court" template and, if nothing scores above ``threshold``, against
    "per curiam".

    Args:
        paras_text: List of paragraph strings.
        threshold: Similarity a fragment must exceed to be accepted.

    Returns:
        A ``(sentence, paragraph_index, score)`` tuple for the first template
        whose best match exceeds ``threshold``.

    Raises:
        LookupError: If no fragment matches either template, including when
            ``paras_text`` holds no non-blank text.
    """
    all_sents = []
    paras_inds = []
    for i, pt in enumerate(paras_text):
        for s in re.split(r'\.|:', pt):
            # Splitting leaves empty/whitespace fragments (e.g. after a final
            # period); they carry no content, so do not embed or score them.
            if s.strip():
                all_sents.append(s.lower())
                paras_inds.append(i)

    if all_sents:
        templates = (
            "justice x delivered the opinion of the court",
            "per curiam",
        )
        for template in templates:
            ind, score = semantic_match(template, all_sents)
            if score > threshold:
                return all_sents[ind], paras_inds[ind], score

    raise LookupError("Could Not Locate Authoring Justice Sentence/Paragraph")
|
| 48 |
+
|
| 49 |
+
def draw_line_above_sent(folderpath, sent, para_ind):
    """Draw a horizontal rule on the page image above the line matching ``sent``.

    The line of paragraph ``para_ind`` (from ``paragraphs.csv``) that is most
    similar to ``sent`` is looked up in ``data.csv`` to get its bounding box.
    A black, 3-pixel line is then drawn at the box's second coordinate,
    extended 10 pixels past the first and third coordinates, and the image
    ``<folderpath>/<page>-processed.png`` is overwritten in place.

    Args:
        folderpath: Directory holding ``data.csv``, ``paragraphs.csv`` and the
            ``*-processed.png`` page images.
        sent: Sentence text to locate inside the paragraph.
        para_ind: Row index of the paragraph in ``paragraphs.csv``.

    Raises:
        LookupError: If ``data.csv`` has no row for the matched page.
        FileNotFoundError: If the page image cannot be read.
        OSError: If the annotated image cannot be written back.
    """
    data_df = pd.read_csv(folderpath + '/data.csv')
    paras_df = pd.read_csv(folderpath + '/paragraphs.csv')
    # NOTE(review): eval() executes whatever expression the CSV cell holds.
    # If these columns only contain plain literals, ast.literal_eval is the
    # safe replacement -- confirm against whatever writes these files.
    para_lines = eval(paras_df['Lines'].tolist()[para_ind])
    text_lines = [text for (_, _, _, text) in para_lines]

    ind, _ = semantic_match(sent, text_lines)
    pg_ind, line_ind, _, _ = para_lines[ind]

    page_rows = data_df[data_df['Pg Ind'] == pg_ind]['Lines'].tolist()
    if not page_rows:
        raise LookupError("No row for page " + str(pg_ind) + " in data.csv")
    line_data = eval(page_rows[0])[line_ind]
    line_bbox = line_data[0:-1]
    # OpenCV point coordinates must be integers; coerce in case the CSV
    # stored them as floats.
    left, top, right = (int(round(line_bbox[k])) for k in (0, 1, 2))

    image_path = folderpath + '/' + str(pg_ind) + '-processed.png'
    image = cv2.imread(image_path)
    # cv2.imread signals failure by returning None instead of raising.
    if image is None:
        raise FileNotFoundError("Could not read page image: " + image_path)
    image = cv2.line(image, (left - 10, top), (right + 10, top), (0, 0, 0), 3)
    # cv2.imwrite reports failure through its boolean return value.
    if not cv2.imwrite(image_path, image):
        raise OSError("Could not write page image: " + image_path)
|
| 66 |
+
|
| 67 |
+
def process_file(folderpath, draw=False):
    """Tag the majority-author sentence of the case stored in ``folderpath``.

    Loads the paragraph text, locates the authoring-justice sentence and,
    when ``draw`` is true, marks it on the page image.

    Args:
        folderpath: Directory containing the case's CSV files and page images.
        draw: Whether to draw a line above the located sentence.
    """
    paragraphs = get_paragraphed_text(folderpath)
    sentence, para_index, _score = get_majority_author_sentence(paragraphs)
    if not draw:
        return
    draw_line_above_sent(folderpath, sentence, para_index)
|
| 72 |
+
|
| 73 |
+
# Module-level call: tags one hard-coded case folder (and draws on its page
# image) every time this file is run or imported.
process_file("PDF Cases/462_122", draw=True)
|