cools committed on
Commit
5b63bc4
·
1 Parent(s): 2533b14

Update Tagger.py

Browse files

(1) Identifies the majority opinion more robustly, and (2) detects concurrences and dissents

Files changed (1) hide show
  1. Tagger.py +39 -35
Tagger.py CHANGED
@@ -20,7 +20,6 @@ def get_paragraphed_text(folderpath):
20
  paras_text.append(" ".join(para_lines).strip().replace(' ', ' '))
21
  return paras_text
22
 
23
-
24
  def semantic_match(template, corpus):
25
  embs = model.encode(corpus)
26
  if type(template) == list:
@@ -30,41 +29,40 @@ def semantic_match(template, corpus):
30
  scores = cos_sim(embs, template_emb)
31
  return np.argmax(scores), max(scores)
32
 
33
-
34
- def keyword_match(keywords, corpus):
35
- scores = []
36
- for (i, c) in enumerate(corpus):
37
- precision = sum(word.lower() in keywords for word in c.split(' ')) / len(c.split(' '))
38
- recall = sum(k in c.split(' ') for k in keywords) / len(keywords)
39
- if (precision + recall) == 0:
40
- f1 = 0
41
- else:
42
- f1 = 2 * precision * recall / (precision + recall)
43
- scores.append(f1)
44
- return np.argmax(scores), max(scores)
45
-
46
-
47
  def get_majority_author_sentence(paras_text):
48
- all_sents = []
49
- paras_inds = []
50
- for (i, pt) in enumerate(paras_text):
51
- # sents = re.split('\.|:',pt)
52
  sents = sent_tokenize(pt)
53
- for s in sents:
54
- all_sents.append(s.lower())
55
- paras_inds.append(i)
56
- sem_ind, sem_score = semantic_match(template="justice x delivered the opinion of the court", corpus=all_sents)
57
- sem_pc_ind, sem_pc_score = semantic_match(template="per curiam", corpus=all_sents)
58
- kw_ind, kw_score = keyword_match(['delivered', 'opinion', 'court', 'justice'], corpus=all_sents)
59
- kw_pc_ind, kw_pc_score = keyword_match(['per', 'curiam'], corpus=all_sents)
60
-
61
- if sem_ind == kw_ind: # Definitely true
62
- return all_sents[sem_ind], paras_inds[sem_ind]
63
- if sem_pc_ind == kw_pc_ind and (sem_pc_score > sem_score or kw_pc_score > kw_score):
64
- return all_sents[sem_pc_ind], paras_inds[sem_pc_ind]
65
  raise Exception("Could Not Locate Authoring Justice Sentence/Paragraph")
66
 
67
- def draw_line_above_sent(folderpath, sent, para_ind):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
68
  data_df = pd.read_csv(folderpath + '/data.csv')
69
  paras_df = pd.read_csv(folderpath + '/paragraphs.csv')
70
  para_lines = eval(paras_df['Lines'].tolist()[para_ind])
@@ -79,11 +77,17 @@ def draw_line_above_sent(folderpath, sent, para_ind):
79
  line_data = eval(data_df[data_df['Pg Ind'] == pg_ind]['Lines'].tolist()[0])[line_ind]
80
  line_bbox = line_data[0:-1]
81
  image = cv2.imread(folderpath + '/' + str(pg_ind) + '-processed.png')
82
- image = cv2.line(image, (line_bbox[0] - 10, line_bbox[1]), (line_bbox[2] + 10, line_bbox[1]), (0, 0, 0), 3)
83
  cv2.imwrite(folderpath + '/' + str(pg_ind) + '-processed.png', image)
84
 
85
  def process_file(folderpath, draw=False):
86
  paras_text = get_paragraphed_text(folderpath)
87
- majority_author_sent, majority_author_para_ind = get_majority_author_sentence(paras_text)
 
 
88
  if draw:
89
- draw_line_above_sent(folderpath, majority_author_sent, majority_author_para_ind)
 
 
 
 
 
20
  paras_text.append(" ".join(para_lines).strip().replace(' ', ' '))
21
  return paras_text
22
 
 
23
  def semantic_match(template, corpus):
24
  embs = model.encode(corpus)
25
  if type(template) == list:
 
29
  scores = cos_sim(embs, template_emb)
30
  return np.argmax(scores), max(scores)
31
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
32
  def get_majority_author_sentence(paras_text):
33
+ for (i,pt) in enumerate(paras_text):
 
 
 
34
  sents = sent_tokenize(pt)
35
+ for (j,s) in enumerate(sents):
36
+ s = s.lower()
37
+ if ("justice" in s and "opinion" in s and "court" in s and ("delivered" in s or "announced" in s)):
38
+ if j != 0 and j != len(sents)-1:
39
+ print("Located, but not within first or last paragraph")
40
+ return [s, i]
41
+ for (j,s) in enumerate(sents): # Per curiam
42
+ s = s.lower()
43
+ if ("per" in s and "curiam" in s):
44
+ if j != 0 and j != len(sents)-1:
45
+ print("Located, but not within first or last paragraph")
46
+ return [s, i]
47
  raise Exception("Could Not Locate Authoring Justice Sentence/Paragraph")
48
 
49
+ def get_other_author_sentence(paras_text, ind_maj):
50
+ data = {}
51
+ data['Concurrences'], data['Dissents'] = [], []
52
+ for (i,pt) in enumerate(paras_text):
53
+ if i < ind_maj:
54
+ continue
55
+ sents = sent_tokenize(pt)
56
+ for (j,s) in enumerate(sents):
57
+ s = s.lower()
58
+ if "justice" in s and "," in s:
59
+ if "concurring" in s:
60
+ data['Concurrences'].append((s,i))
61
+ if "dissenting" in s:
62
+ data['Dissents'].append((s,i))
63
+ return data
64
+
65
+ def draw_line_above_sent(folderpath, sent, para_ind, color=(0,0,0)):
66
  data_df = pd.read_csv(folderpath + '/data.csv')
67
  paras_df = pd.read_csv(folderpath + '/paragraphs.csv')
68
  para_lines = eval(paras_df['Lines'].tolist()[para_ind])
 
77
  line_data = eval(data_df[data_df['Pg Ind'] == pg_ind]['Lines'].tolist()[0])[line_ind]
78
  line_bbox = line_data[0:-1]
79
  image = cv2.imread(folderpath + '/' + str(pg_ind) + '-processed.png')
80
+ image = cv2.line(image, (line_bbox[0] - 10, line_bbox[1]), (line_bbox[2] + 10, line_bbox[1]), thickness=2)
81
  cv2.imwrite(folderpath + '/' + str(pg_ind) + '-processed.png', image)
82
 
83
  def process_file(folderpath, draw=False):
84
  paras_text = get_paragraphed_text(folderpath)
85
+ maj = get_majority_author_sentence(paras_text)
86
+ data = get_other_author_sentence(paras_text, maj[1])
87
+
88
  if draw:
89
+ draw_line_above_sent(folderpath, maj[0], maj[1])
90
+ for c in data['Concurrences']:
91
+ draw_line_above_sent(folderpath, c[0], c[1], color=(0,100,0))
92
+ for d in data['Dissents']:
93
+ draw_line_above_sent(folderpath, d[0], d[1], color=(0,0,100))