cools committed on
Commit
ca87ae5
·
1 Parent(s): 0c734ac

Upload Tagger

Browse files

Tags (and visualizes) various aspects of the opinions

Files changed (1) hide show
  1. Tagger.py +73 -0
Tagger.py ADDED
@@ -0,0 +1,73 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # This file tags the main opinion text: it locates the sentence naming the authoring justice and can mark it on the page image
2
+ import pandas as pd
3
+ import numpy as np
4
+ import re
5
+ from sentence_transformers import SentenceTransformer
6
+ from sentence_transformers.util import cos_sim
7
+ import cv2
8
+
9
def get_paragraphed_text(folderpath):
    """Load a case's paragraphs and return each one as a single text string.

    Reads ``<folderpath>/paragraphs.csv``. Each cell of its ``Lines`` column
    holds the string form of a list of line records; the last element of a
    record is that line's text.

    Args:
        folderpath: Directory that contains ``paragraphs.csv``.

    Returns:
        A list with one string per paragraph: the stripped line texts joined
        with single spaces, with every run of whitespace collapsed to one
        space and no leading/trailing whitespace.
    """
    paras_df = pd.read_csv(folderpath + '/paragraphs.csv')
    paras_text = []
    for cell in paras_df['Lines'].tolist():
        # NOTE(review): eval() executes whatever the CSV cell contains. That is
        # only acceptable while paragraphs.csv is produced by this project's
        # own pipeline; consider ast.literal_eval if the cells are guaranteed
        # to be plain Python literals.
        para = eval(cell)
        joined = " ".join(line[-1].strip() for line in para)
        # Empty lines leave doubled spaces after the join; split()/join
        # collapses any whitespace run to exactly one space and also trims
        # the ends.
        paras_text.append(" ".join(joined.split()))
    return paras_text
17
+
18
+
19
# Loaded models keyed by model name, so repeated calls do not reload weights.
_MODEL_CACHE = {}


def _get_sentence_model(name):
    """Return a cached SentenceTransformer for *name*, loading it on first use."""
    if name not in _MODEL_CACHE:
        _MODEL_CACHE[name] = SentenceTransformer(name)
    return _MODEL_CACHE[name]


def semantic_match(template, corpus):
    """Find the corpus entry that is semantically closest to the template(s).

    Args:
        template: A single template string, or a list of template strings.
        corpus: A non-empty list of strings to search.

    Returns:
        A tuple ``(index, score)``. ``index`` is the ``int`` position in
        ``corpus`` of the best match. ``score`` is its cosine similarity as a
        scalar tensor; it can be compared directly with a float or converted
        with ``float(score)``. With several templates, an entry's score is
        its best similarity over all of them.

    Raises:
        ValueError: If ``corpus`` is empty.
    """
    if len(corpus) == 0:
        raise ValueError("corpus must contain at least one text to match against")

    templates = template if isinstance(template, list) else [template]
    model = _get_sentence_model('paraphrase-MiniLM-L6-v2')

    # scores[i][j] is the similarity between corpus[i] and templates[j].
    scores = cos_sim(model.encode(corpus), model.encode(templates))

    # Reduce over the template axis first so the argmax is a real corpus
    # index even when more than one template is supplied.
    best_per_text = scores.max(dim=1).values
    best_ind = int(best_per_text.argmax())
    return best_ind, best_per_text[best_ind]
28
+
29
def get_majority_author_sentence(paras_text):
    """Locate the sentence that announces who wrote the opinion of the court.

    Every paragraph is split into fragments on ``.`` and ``:``. The lowercased
    fragments are matched first against a "justice x delivered the opinion of
    the court" template and, if that does not score above 0.6, against
    "per curiam".

    Args:
        paras_text: List of paragraph strings.

    Returns:
        A tuple ``(sentence, para_ind, score)``: the lowercased matching
        fragment, the index in ``paras_text`` of the paragraph it came from,
        and the similarity score returned by ``semantic_match``.

    Raises:
        Exception: If there is nothing to match or neither template scores
            above 0.6.
    """
    all_sents = []
    paras_inds = []
    for i, pt in enumerate(paras_text):
        for s in re.split(r'\.|:', pt):
            # Splitting leaves empty/whitespace-only pieces (e.g. after a
            # trailing period); they carry no content worth matching.
            if not s.strip():
                continue
            all_sents.append(s.lower())
            paras_inds.append(i)

    # With no fragments at all there is nothing to match, so skip straight
    # to the "not found" error below.
    if all_sents:
        templates = ("justice x delivered the opinion of the court", "per curiam")
        for template in templates:
            ind, score = semantic_match(template, all_sents)
            if score > 0.6:
                return all_sents[ind], paras_inds[ind], score

    raise Exception("Could Not Locate Authoring Justice Sentence/Paragraph")
48
+
49
def draw_line_above_sent(folderpath, sent, para_ind):
    """Draw a horizontal black line on the page image at the matched line.

    Looks up paragraph ``para_ind`` in ``<folderpath>/paragraphs.csv``, picks
    the line of that paragraph most similar to ``sent``, finds that line's
    record in ``<folderpath>/data.csv`` and draws a 3-pixel black line on
    ``<folderpath>/<page index>-processed.png``. The image file is
    overwritten in place.

    Args:
        folderpath: Directory holding ``data.csv``, ``paragraphs.csv`` and the
            ``*-processed.png`` page images.
        sent: Sentence text to locate within the paragraph.
        para_ind: Row index of the paragraph in ``paragraphs.csv``.

    Raises:
        IndexError: If ``data.csv`` has no row for the matched page index.
        FileNotFoundError: If the page image cannot be read.
        OSError: If the annotated image cannot be written back.
    """
    data_df = pd.read_csv(folderpath + '/data.csv')
    paras_df = pd.read_csv(folderpath + '/paragraphs.csv')

    # NOTE(review): eval() executes the CSV cell contents; only safe while
    # these CSV files are produced by this project's own pipeline.
    para_lines = eval(paras_df['Lines'].tolist()[para_ind])
    # Each paragraph line record is (page index, line index, <unused>, text).
    text_lines = [text for _, _, _, text in para_lines]

    ind, _ = semantic_match(sent, text_lines)
    pg_ind, line_ind, _, _ = para_lines[ind]

    page_rows = data_df[data_df['Pg Ind'] == pg_ind]['Lines'].tolist()
    if not page_rows:
        raise IndexError("No row in data.csv for page index " + str(pg_ind))
    line_data = eval(page_rows[0])[line_ind]

    # Everything but the last element is treated as the bounding box.
    # Presumably (x_left, y_top, x_right, ...) — TODO confirm against the
    # code that writes data.csv.
    line_bbox = line_data[0:-1]
    # cv2.line only accepts integer pixel coordinates.
    x_left = int(line_bbox[0])
    y_top = int(line_bbox[1])
    x_right = int(line_bbox[2])

    image_path = folderpath + '/' + str(pg_ind) + '-processed.png'
    image = cv2.imread(image_path)
    if image is None:
        # cv2.imread signals a missing/unreadable file by returning None.
        raise FileNotFoundError("Could not read page image: " + image_path)

    # Extend the line 10 px past the box on both sides.
    image = cv2.line(image, (x_left - 10, y_top), (x_right + 10, y_top), (0, 0, 0), 3)
    if not cv2.imwrite(image_path, image):
        raise OSError("Could not write page image: " + image_path)
66
+
67
def process_file(folderpath, draw=False):
    """Tag one processed case folder.

    Loads the case's paragraphs and locates the sentence that names the
    authoring justice. When ``draw`` is true, a line is also drawn on the
    page image at that sentence.

    Args:
        folderpath: Directory containing the processed case files.
        draw: Whether to annotate the page image.
    """
    paragraphs = get_paragraphed_text(folderpath)
    author_sent, author_para_ind, _ = get_majority_author_sentence(paragraphs)
    if not draw:
        return
    draw_line_above_sent(folderpath, author_sent, author_para_ind)
72
+
73
# Runs whenever this module is executed or imported: tags the hard-coded case
# folder and draws the marker line on its page image.
process_file(folderpath="PDF Cases/462_122", draw=True)