Update Tagger.py
Browse files — (1) More robust way to identify the majority, and (2) can now detect concurrences and dissents
Tagger.py
CHANGED
|
@@ -20,7 +20,6 @@ def get_paragraphed_text(folderpath):
|
|
| 20 |
paras_text.append(" ".join(para_lines).strip().replace(' ', ' '))
|
| 21 |
return paras_text
|
| 22 |
|
| 23 |
-
|
| 24 |
def semantic_match(template, corpus):
|
| 25 |
embs = model.encode(corpus)
|
| 26 |
if type(template) == list:
|
|
@@ -30,41 +29,40 @@ def semantic_match(template, corpus):
|
|
| 30 |
scores = cos_sim(embs, template_emb)
|
| 31 |
return np.argmax(scores), max(scores)
|
| 32 |
|
| 33 |
-
|
| 34 |
-
def keyword_match(keywords, corpus):
    """Return the index and F1 score of the document that best matches ``keywords``.

    Each document is split on single spaces and scored with the F1 of:

    * precision -- fraction of the document's tokens that are keywords;
    * recall    -- fraction of the keywords that occur among the tokens.

    Matching is case-insensitive on both sides, so a capitalised token such as
    "Justice" counts toward recall as well as precision.

    Args:
        keywords: Sized collection of keyword strings.
        corpus: Iterable of document strings.

    Returns:
        ``(best_index, best_score)`` where ``best_index`` is the result of
        ``np.argmax`` over the per-document scores and ``best_score`` is the
        highest F1 value (0 when nothing matches or ``keywords`` is empty).

    Raises:
        ValueError: If ``corpus`` contains no documents.
    """
    lowered_keywords = [k.lower() for k in keywords]
    keyword_set = set(lowered_keywords)

    scores = []
    for doc in corpus:
        # Tokenise once per document; ''.split(' ') yields [''], so the token
        # list is never empty and the precision division is always safe.
        tokens = [t.lower() for t in doc.split(' ')]
        token_set = set(tokens)

        precision = sum(t in keyword_set for t in tokens) / len(tokens)
        if lowered_keywords:
            recall = sum(k in token_set for k in lowered_keywords) / len(lowered_keywords)
        else:
            # No keywords to recall: score the document as a non-match
            # instead of dividing by zero.
            recall = 0

        total = precision + recall
        scores.append(2 * precision * recall / total if total else 0)

    if not scores:
        raise ValueError("corpus must contain at least one document")
    return np.argmax(scores), max(scores)
|
| 45 |
-
|
| 46 |
-
|
| 47 |
def get_majority_author_sentence(paras_text):
|
| 48 |
-
|
| 49 |
-
paras_inds = []
|
| 50 |
-
for (i, pt) in enumerate(paras_text):
|
| 51 |
-
# sents = re.split('\.|:',pt)
|
| 52 |
sents = sent_tokenize(pt)
|
| 53 |
-
for s in sents:
|
| 54 |
-
|
| 55 |
-
|
| 56 |
-
|
| 57 |
-
|
| 58 |
-
|
| 59 |
-
|
| 60 |
-
|
| 61 |
-
|
| 62 |
-
|
| 63 |
-
|
| 64 |
-
|
| 65 |
raise Exception("Could Not Locate Authoring Justice Sentence/Paragraph")
|
| 66 |
|
| 67 |
-
def
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 68 |
data_df = pd.read_csv(folderpath + '/data.csv')
|
| 69 |
paras_df = pd.read_csv(folderpath + '/paragraphs.csv')
|
| 70 |
para_lines = eval(paras_df['Lines'].tolist()[para_ind])
|
|
@@ -79,11 +77,17 @@ def draw_line_above_sent(folderpath, sent, para_ind):
|
|
| 79 |
line_data = eval(data_df[data_df['Pg Ind'] == pg_ind]['Lines'].tolist()[0])[line_ind]
|
| 80 |
line_bbox = line_data[0:-1]
|
| 81 |
image = cv2.imread(folderpath + '/' + str(pg_ind) + '-processed.png')
|
| 82 |
-
image = cv2.line(image, (line_bbox[0] - 10, line_bbox[1]), (line_bbox[2] + 10, line_bbox[1]),
|
| 83 |
cv2.imwrite(folderpath + '/' + str(pg_ind) + '-processed.png', image)
|
| 84 |
|
| 85 |
def process_file(folderpath, draw=False):
|
| 86 |
paras_text = get_paragraphed_text(folderpath)
|
| 87 |
-
|
|
|
|
|
|
|
| 88 |
if draw:
|
| 89 |
-
draw_line_above_sent(folderpath,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 20 |
paras_text.append(" ".join(para_lines).strip().replace(' ', ' '))
|
| 21 |
return paras_text
|
| 22 |
|
|
|
|
| 23 |
def semantic_match(template, corpus):
|
| 24 |
embs = model.encode(corpus)
|
| 25 |
if type(template) == list:
|
|
|
|
| 29 |
scores = cos_sim(embs, template_emb)
|
| 30 |
return np.argmax(scores), max(scores)
|
| 31 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 32 |
def get_majority_author_sentence(paras_text):
    """Find the sentence that announces the opinion of the Court.

    Paragraphs are scanned in order. Within each paragraph the sentences are
    lower-cased and checked twice:

    1. for a signed-opinion announcement: the sentence contains "justice",
       "opinion" and "court", plus either "delivered" or "announced";
    2. failing that, for a per curiam marker: the sentence contains both
       "per" and "curiam".

    The first hit wins.

    Args:
        paras_text: Sequence of paragraph strings.

    Returns:
        A two-item list ``[sentence, paragraph_index]``; the sentence is the
        lower-cased form.

    Raises:
        Exception: If no paragraph contains a matching sentence.
    """
    # sent_tokenize is presumably NLTK's sentence splitter -- confirm against
    # the file's imports.

    def _report_if_interior(position, total):
        # NOTE(review): the message says "paragraph", but the check is on the
        # sentence's position inside its paragraph -- confirm intended wording.
        if position not in (0, total - 1):
            print("Located, but not within first or last paragraph")

    for para_ind, paragraph in enumerate(paras_text):
        lowered = [sentence.lower() for sentence in sent_tokenize(paragraph)]
        total = len(lowered)

        # Pass 1: a signed opinion ("Justice X delivered the opinion of the Court").
        for position, sentence in enumerate(lowered):
            has_nouns = all(word in sentence for word in ("justice", "opinion", "court"))
            has_verb = any(verb in sentence for verb in ("delivered", "announced"))
            if has_nouns and has_verb:
                _report_if_interior(position, total)
                return [sentence, para_ind]

        # Pass 2: an unsigned per curiam opinion.
        for position, sentence in enumerate(lowered):
            if "per" in sentence and "curiam" in sentence:
                _report_if_interior(position, total)
                return [sentence, para_ind]

    raise Exception("Could Not Locate Authoring Justice Sentence/Paragraph")
|
| 48 |
|
| 49 |
+
def get_other_author_sentence(paras_text, ind_maj):
    """Collect sentences that introduce concurring or dissenting opinions.

    Only paragraphs at index ``ind_maj`` or later are examined. A lower-cased
    sentence qualifies when it contains both "justice" and a comma; it is then
    recorded as a concurrence if it contains "concurring" and as a dissent if
    it contains "dissenting" (a sentence with both words lands in both lists).

    Args:
        paras_text: Sequence of paragraph strings.
        ind_maj: Paragraph index of the majority-author sentence; earlier
            paragraphs are skipped.

    Returns:
        A dict with keys ``'Concurrences'`` and ``'Dissents'``, each a list of
        ``(lower_cased_sentence, paragraph_index)`` tuples in document order.
    """
    # sent_tokenize is presumably NLTK's sentence splitter -- confirm against
    # the file's imports.
    concurrences = []
    dissents = []

    for para_ind, paragraph in enumerate(paras_text):
        if para_ind < ind_maj:
            continue

        for raw_sentence in sent_tokenize(paragraph):
            sentence = raw_sentence.lower()
            if not ("justice" in sentence and "," in sentence):
                continue
            hit = (sentence, para_ind)
            if "concurring" in sentence:
                concurrences.append(hit)
            if "dissenting" in sentence:
                dissents.append(hit)

    return {'Concurrences': concurrences, 'Dissents': dissents}
|
| 64 |
+
|
| 65 |
+
def draw_line_above_sent(folderpath, sent, para_ind, color=(0,0,0)):
|
| 66 |
data_df = pd.read_csv(folderpath + '/data.csv')
|
| 67 |
paras_df = pd.read_csv(folderpath + '/paragraphs.csv')
|
| 68 |
para_lines = eval(paras_df['Lines'].tolist()[para_ind])
|
|
|
|
| 77 |
line_data = eval(data_df[data_df['Pg Ind'] == pg_ind]['Lines'].tolist()[0])[line_ind]
|
| 78 |
line_bbox = line_data[0:-1]
|
| 79 |
image = cv2.imread(folderpath + '/' + str(pg_ind) + '-processed.png')
|
| 80 |
+
image = cv2.line(image, (line_bbox[0] - 10, line_bbox[1]), (line_bbox[2] + 10, line_bbox[1]), thickness=2)
|
| 81 |
cv2.imwrite(folderpath + '/' + str(pg_ind) + '-processed.png', image)
|
| 82 |
|
| 83 |
def process_file(folderpath, draw=False):
    """Locate the opinion-author sentences for one case folder and optionally mark them.

    Reads the paragraphs for ``folderpath``, finds the majority-author
    sentence, then collects concurrence and dissent sentences from that
    paragraph onward. When ``draw`` is true, ``draw_line_above_sent`` is called
    for the majority sentence (default colour), then for every concurrence
    with colour ``(0, 100, 0)`` and every dissent with colour ``(0, 0, 100)``.

    Args:
        folderpath: Folder passed through to the paragraph reader and to
            ``draw_line_above_sent``.
        draw: Whether to draw the marker lines.
    """
    paras_text = get_paragraphed_text(folderpath)
    majority = get_majority_author_sentence(paras_text)
    others = get_other_author_sentence(paras_text, majority[1])

    if not draw:
        return

    draw_line_above_sent(folderpath, majority[0], majority[1])

    # Concurrences are drawn before dissents, each group with its own colour.
    colour_by_kind = (('Concurrences', (0, 100, 0)), ('Dissents', (0, 0, 100)))
    for kind, colour in colour_by_kind:
        for entry in others[kind]:
            draw_line_above_sent(folderpath, entry[0], entry[1], color=colour)
|