cools committed on
Commit
3aa9c84
·
1 Parent(s): ab52aeb

Update Tagger.py

Browse files
Files changed (1) hide show
  1. Tagger.py +28 -15
Tagger.py CHANGED
@@ -1,4 +1,3 @@
1
- # This file tags the major text
2
  import pandas as pd
3
  import numpy as np
4
  import re
@@ -6,6 +5,8 @@ from sentence_transformers import SentenceTransformer
6
  from sentence_transformers.util import cos_sim
7
  import cv2
8
 
 
 
9
  def get_paragraphed_text(folderpath):
10
  paras_df = pd.read_csv(folderpath + '/paragraphs.csv')
11
  paras_lines = [eval(p) for p in paras_df['Lines'].tolist()]
@@ -17,7 +18,6 @@ def get_paragraphed_text(folderpath):
17
 
18
 
19
  def semantic_match(template, corpus):
20
- model = SentenceTransformer('all-mpnet-base-v2')
21
  embs = model.encode(corpus)
22
  if type(template) == list:
23
  template_emb = model.encode(template)
@@ -26,25 +26,38 @@ def semantic_match(template, corpus):
26
  scores = cos_sim(embs, template_emb)
27
  return np.argmax(scores), max(scores)
28
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
29
def get_majority_author_sentence(paras_text):
    """Find the sentence that announces who wrote the majority opinion.

    Every paragraph is split on '.' and ':' into lower-cased pieces, and
    each piece remembers the index of the paragraph it came from.  The
    pieces are then compared against two templates in turn via
    ``semantic_match``; the first template whose best score exceeds its
    threshold wins.

    Args:
        paras_text: iterable of paragraph strings.

    Returns:
        A ``(sentence, paragraph_index, score)`` tuple for the winning piece.

    Raises:
        Exception: if neither template scores above its threshold.
    """
    all_sents = []
    paras_inds = []
    for para_no, para in enumerate(paras_text):
        pieces = [piece.lower() for piece in re.split('\.|:', para)]
        all_sents.extend(pieces)
        paras_inds.extend([para_no] * len(pieces))

    # Templates are tried in order; the second is only evaluated when the
    # first one fails to clear its threshold.
    attempts = (
        ("justice x delivered the opinion of the court", 0.65),
        ("per curiam", 0.6),
    )
    for template, threshold in attempts:
        ind, score = semantic_match(template, all_sents)
        if score > threshold:
            return all_sents[ind], paras_inds[ind], score

    raise Exception("Could Not Locate Authoring Justice Sentence/Paragraph")
48
 
49
  def draw_line_above_sent(folderpath, sent, para_ind):
50
  data_df = pd.read_csv(folderpath + '/data.csv')
@@ -66,6 +79,6 @@ def draw_line_above_sent(folderpath, sent, para_ind):
66
 
67
  def process_file(folderpath, draw=False):
68
  paras_text = get_paragraphed_text(folderpath)
69
- majority_author_sent, majority_author_para_ind, score = get_majority_author_sentence(paras_text)
70
  if draw:
71
  draw_line_above_sent(folderpath, majority_author_sent, majority_author_para_ind)
 
 
1
  import pandas as pd
2
  import numpy as np
3
  import re
 
5
  from sentence_transformers.util import cos_sim
6
  import cv2
7
 
8
+ model = SentenceTransformer('all-mpnet-base-v2')
9
+
10
  def get_paragraphed_text(folderpath):
11
  paras_df = pd.read_csv(folderpath + '/paragraphs.csv')
12
  paras_lines = [eval(p) for p in paras_df['Lines'].tolist()]
 
18
 
19
 
20
  def semantic_match(template, corpus):
 
21
  embs = model.encode(corpus)
22
  if type(template) == list:
23
  template_emb = model.encode(template)
 
26
  scores = cos_sim(embs, template_emb)
27
  return np.argmax(scores), max(scores)
28
 
29
+
30
def _contains_phrase(tokens, phrase):
    """Return True if the token list `phrase` occurs contiguously in `tokens`."""
    width = len(phrase)
    if width == 0:
        # A keyword with no words in it is never considered a match.
        return False
    return any(
        tokens[start:start + width] == phrase
        for start in range(len(tokens) - width + 1)
    )


def keyword_match(keywords, corpus):
    """Score each corpus entry against a keyword list with an F1 measure.

    Matching is case-insensitive and whitespace-tokenised.  For each entry:

    * precision = share of the entry's tokens that belong to some keyword;
    * recall    = share of keywords found in the entry (a multi-word
      keyword such as ``'per curiam'`` must appear as consecutive tokens);
    * score     = harmonic mean of the two (0 when both are 0).

    Args:
        keywords: non-empty sequence of keyword strings (one or more words each).
        corpus: non-empty sequence of sentence strings.

    Returns:
        ``(index, score)`` of the best-scoring entry; the index comes from
        ``np.argmax`` so ties resolve to the first entry.

    Raises:
        ValueError: if `keywords` or `corpus` is empty.
    """
    if len(corpus) == 0:
        raise ValueError("keyword_match requires a non-empty corpus")
    if len(keywords) == 0:
        raise ValueError("keyword_match requires at least one keyword")

    # Tokenise keywords once; the flat vocabulary drives precision, the
    # per-keyword phrases drive recall.
    phrases = [keyword.lower().split() for keyword in keywords]
    vocab = {token for phrase in phrases for token in phrase}

    scores = []
    for sentence in corpus:
        # split() (not split(' ')) so leading/repeated spaces do not create
        # empty tokens that would dilute precision.
        tokens = sentence.lower().split()
        if not tokens:
            scores.append(0.0)
            continue
        precision = sum(token in vocab for token in tokens) / len(tokens)
        recall = sum(_contains_phrase(tokens, phrase) for phrase in phrases) / len(phrases)
        total = precision + recall
        scores.append(2 * precision * recall / total if total else 0.0)
    return np.argmax(scores), max(scores)
41
+
42
+
43
def get_majority_author_sentence(paras_text):
    """Locate the sentence naming the author of the majority opinion.

    Each paragraph is split on '.' and ':' into lower-cased sentences, and
    the originating paragraph index is recorded for every sentence.  Two
    independent matchers (``semantic_match`` and ``keyword_match``) are run
    for both the "justice x delivered the opinion of the court" pattern and
    the "per curiam" pattern; a candidate is accepted only when both
    matchers point at the same sentence.

    Args:
        paras_text: iterable of paragraph strings.

    Returns:
        A ``(sentence, paragraph_index)`` tuple for the accepted sentence.

    Raises:
        Exception: if the matchers do not agree on any candidate.
    """
    all_sents = []
    paras_inds = []
    for i, pt in enumerate(paras_text):
        # Raw string so the backslash is passed to the regex engine intact.
        for s in re.split(r'\.|:', pt):
            all_sents.append(s.lower())
            paras_inds.append(i)

    sem_ind, sem_score = semantic_match(template="justice x delivered the opinion of the court", corpus=all_sents)
    sem_pc_ind, sem_pc_score = semantic_match(template="per curiam", corpus=all_sents)
    kw_ind, kw_score = keyword_match(['delivered', 'opinion', 'court', 'justice'], corpus=all_sents)
    kw_pc_ind, kw_pc_score = keyword_match(['per curiam'], corpus=all_sents)

    # Signed-opinion pattern: accept when both matchers pick the same sentence.
    # NOTE(review): agreement is checked on indices only; if the keyword
    # scores are all zero its argmax is index 0, which could agree with the
    # semantic match by coincidence -- consider also requiring kw_score > 0.
    if sem_ind == kw_ind:
        return all_sents[sem_ind], paras_inds[sem_ind]

    # Per-curiam pattern: both matchers must agree, and at least one of them
    # must rate "per curiam" above the signed-opinion pattern.
    if sem_pc_ind == kw_pc_ind and (sem_pc_score > sem_score or kw_pc_score > kw_score):
        # Fixed: previously indexed the undefined name `paras` (NameError).
        return all_sents[sem_pc_ind], paras_inds[sem_pc_ind]

    raise Exception("Could Not Locate Authoring Justice Sentence/Paragraph")
61
 
62
  def draw_line_above_sent(folderpath, sent, para_ind):
63
  data_df = pd.read_csv(folderpath + '/data.csv')
 
79
 
80
  def process_file(folderpath, draw=False):
81
  paras_text = get_paragraphed_text(folderpath)
82
+ majority_author_sent, majority_author_para_ind = get_majority_author_sentence(paras_text)
83
  if draw:
84
  draw_line_above_sent(folderpath, majority_author_sent, majority_author_para_ind)