Update Tagger.py
Browse files
Tagger.py
CHANGED
|
@@ -1,4 +1,3 @@
|
|
| 1 |
-
# This file tags the major text
|
| 2 |
import pandas as pd
|
| 3 |
import numpy as np
|
| 4 |
import re
|
|
@@ -6,6 +5,8 @@ from sentence_transformers import SentenceTransformer
|
|
| 6 |
from sentence_transformers.util import cos_sim
|
| 7 |
import cv2
|
| 8 |
|
|
|
|
|
|
|
| 9 |
def get_paragraphed_text(folderpath):
|
| 10 |
paras_df = pd.read_csv(folderpath + '/paragraphs.csv')
|
| 11 |
paras_lines = [eval(p) for p in paras_df['Lines'].tolist()]
|
|
@@ -17,7 +18,6 @@ def get_paragraphed_text(folderpath):
|
|
| 17 |
|
| 18 |
|
| 19 |
def semantic_match(template, corpus):
|
| 20 |
-
model = SentenceTransformer('all-mpnet-base-v2')
|
| 21 |
embs = model.encode(corpus)
|
| 22 |
if type(template) == list:
|
| 23 |
template_emb = model.encode(template)
|
|
@@ -26,25 +26,38 @@ def semantic_match(template, corpus):
|
|
| 26 |
scores = cos_sim(embs, template_emb)
|
| 27 |
return np.argmax(scores), max(scores)
|
| 28 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 29 |
def get_majority_author_sentence(paras_text):
|
| 30 |
-
template = "justice x delivered the opinion of the court"
|
| 31 |
all_sents = []
|
| 32 |
paras_inds = []
|
| 33 |
-
for (i,pt) in enumerate(paras_text):
|
| 34 |
sents = re.split('\.|:', pt)
|
| 35 |
for s in sents:
|
| 36 |
all_sents.append(s.lower())
|
| 37 |
paras_inds.append(i)
|
| 38 |
-
|
| 39 |
-
|
| 40 |
-
|
| 41 |
-
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
|
| 48 |
|
| 49 |
def draw_line_above_sent(folderpath, sent, para_ind):
|
| 50 |
data_df = pd.read_csv(folderpath + '/data.csv')
|
|
@@ -66,6 +79,6 @@ def draw_line_above_sent(folderpath, sent, para_ind):
|
|
| 66 |
|
| 67 |
def process_file(folderpath, draw=False):
|
| 68 |
paras_text = get_paragraphed_text(folderpath)
|
| 69 |
-
majority_author_sent, majority_author_para_ind
|
| 70 |
if draw:
|
| 71 |
draw_line_above_sent(folderpath, majority_author_sent, majority_author_para_ind)
|
|
|
|
|
|
|
| 1 |
import pandas as pd
|
| 2 |
import numpy as np
|
| 3 |
import re
|
|
|
|
| 5 |
from sentence_transformers.util import cos_sim
|
| 6 |
import cv2
|
| 7 |
|
| 8 |
+
# Load the sentence-embedding model once at module level; semantic_match()
# reads this global instead of constructing a model itself.
model = SentenceTransformer('all-mpnet-base-v2')
|
| 9 |
+
|
| 10 |
def get_paragraphed_text(folderpath):
|
| 11 |
paras_df = pd.read_csv(folderpath + '/paragraphs.csv')
|
| 12 |
paras_lines = [eval(p) for p in paras_df['Lines'].tolist()]
|
|
|
|
| 18 |
|
| 19 |
|
| 20 |
def semantic_match(template, corpus):
|
|
|
|
| 21 |
embs = model.encode(corpus)
|
| 22 |
if type(template) == list:
|
| 23 |
template_emb = model.encode(template)
|
|
|
|
| 26 |
scores = cos_sim(embs, template_emb)
|
| 27 |
return np.argmax(scores), max(scores)
|
| 28 |
|
| 29 |
+
|
| 30 |
+
def _contains_phrase(tokens, phrase):
    """Return True if the token list `phrase` occurs contiguously in `tokens`."""
    width = len(phrase)
    if width == 0:
        # A blank keyword carries no information; never count it as a hit.
        return False
    return any(
        tokens[start:start + width] == phrase
        for start in range(len(tokens) - width + 1)
    )


def keyword_match(keywords, corpus):
    """Score every corpus entry against a keyword list and return the best one.

    Each entry gets an F1 score built from:
      * precision - fraction of the entry's words that are keyword words;
      * recall    - fraction of keywords found in the entry.

    Matching is case-insensitive. A keyword may consist of several words
    (e.g. ``'per curiam'``); it counts as found only when its words appear
    next to each other, in order, in the entry.

    Args:
        keywords: non-empty list of keyword strings.
        corpus: non-empty list of strings to score.

    Returns:
        ``(index, score)`` - the position (as returned by ``np.argmax``) of
        the highest-scoring entry and its F1 score. Ties resolve to the
        first such entry.

    Raises:
        ValueError: if ``keywords`` or ``corpus`` is empty.
    """
    if not keywords:
        raise ValueError("keywords must not be empty")
    if not corpus:
        raise ValueError("corpus must not be empty")

    # Tokenise keywords once: each keyword becomes a list of lowercase words.
    phrases = [k.lower().split() for k in keywords]
    keyword_words = {word for phrase in phrases for word in phrase}

    scores = []
    for text in corpus:
        # split() with no argument ignores leading/trailing/repeated
        # whitespace, so stray spaces do not dilute precision.
        tokens = text.lower().split()
        if not tokens:
            scores.append(0)
            continue
        precision = sum(tok in keyword_words for tok in tokens) / len(tokens)
        recall = sum(_contains_phrase(tokens, p) for p in phrases) / len(phrases)
        if (precision + recall) == 0:
            f1 = 0
        else:
            f1 = 2 * precision * recall / (precision + recall)
        scores.append(f1)

    best = np.argmax(scores)
    return best, scores[best]
|
| 41 |
+
|
| 42 |
+
|
| 43 |
def get_majority_author_sentence(paras_text):
    """Find the sentence that announces the author of the majority opinion.

    Every paragraph is split into sentences on '.' and ':' and lowercased.
    The sentences are then searched twice - once with semantic_match() and
    once with keyword_match() - for both a signed-opinion template
    ("justice x delivered the opinion of the court") and "per curiam".

    Args:
        paras_text: iterable of paragraph strings.

    Returns:
        ``(sentence, paragraph_index)`` - the lowercased matching sentence
        and the index in ``paras_text`` of the paragraph it came from.

    Raises:
        Exception: if neither the signed-opinion search nor the per curiam
            search yields an agreed-upon sentence.
    """
    all_sents = []
    paras_inds = []  # paras_inds[k] is the paragraph that all_sents[k] came from
    for (i, pt) in enumerate(paras_text):
        # Raw string: '\.' in a normal string literal is an invalid escape.
        for s in re.split(r'\.|:', pt):
            all_sents.append(s.lower())
            paras_inds.append(i)

    sem_ind, sem_score = semantic_match(template="justice x delivered the opinion of the court", corpus=all_sents)
    sem_pc_ind, sem_pc_score = semantic_match(template="per curiam", corpus=all_sents)
    kw_ind, kw_score = keyword_match(['delivered', 'opinion', 'court', 'justice'], corpus=all_sents)
    kw_pc_ind, kw_pc_score = keyword_match(['per curiam'], corpus=all_sents)

    # Both matchers agree on the signed-opinion sentence: accept it outright.
    if sem_ind == kw_ind:
        return all_sents[sem_ind], paras_inds[sem_ind]
    # Otherwise accept the per curiam sentence when both matchers agree on it
    # and at least one of them scores it above its signed-opinion candidate.
    if sem_pc_ind == kw_pc_ind and (sem_pc_score > sem_score or kw_pc_score > kw_score):
        # Fixed: this branch used the undefined name `paras` (NameError).
        return all_sents[sem_pc_ind], paras_inds[sem_pc_ind]
    raise Exception("Could Not Locate Authoring Justice Sentence/Paragraph")
|
| 61 |
|
| 62 |
def draw_line_above_sent(folderpath, sent, para_ind):
|
| 63 |
data_df = pd.read_csv(folderpath + '/data.csv')
|
|
|
|
| 79 |
|
| 80 |
def process_file(folderpath, draw=False):
    """Locate the majority-author sentence for one folder, optionally marking it.

    Args:
        folderpath: folder passed through to get_paragraphed_text() and,
            when drawing, to draw_line_above_sent().
        draw: when true, call draw_line_above_sent() with the located
            sentence and its paragraph index.
    """
    paragraphs = get_paragraphed_text(folderpath)
    author_sent, author_para_ind = get_majority_author_sentence(paragraphs)
    if not draw:
        return
    draw_line_above_sent(folderpath, author_sent, author_para_ind)
|