cools committed on
Commit
7437b7b
·
1 Parent(s): ee033d3

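Updated Tagger to tokenize sentences with NLTK's sent_tokenize instead of a regex split, to match 'per' and 'curiam' as separate keywords, and to fix a bug where the per curiam branch indexed `paras` instead of `paras_inds`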

Browse files
Files changed (1) hide show
  1. Tagger.py +7 -4
Tagger.py CHANGED
@@ -1,9 +1,11 @@
 
1
  import pandas as pd
2
  import numpy as np
3
  import re
4
  from sentence_transformers import SentenceTransformer
5
  from sentence_transformers.util import cos_sim
6
  import cv2
 
7
 
8
  model = SentenceTransformer('all-mpnet-base-v2')
9
 
@@ -44,19 +46,20 @@ def get_majority_author_sentence(paras_text):
44
  all_sents = []
45
  paras_inds = []
46
  for (i, pt) in enumerate(paras_text):
47
- sents = re.split('\.|:', pt)
 
48
  for s in sents:
49
  all_sents.append(s.lower())
50
  paras_inds.append(i)
51
  sem_ind, sem_score = semantic_match(template="justice x delivered the opinion of the court", corpus=all_sents)
52
  sem_pc_ind, sem_pc_score = semantic_match(template="per curiam", corpus=all_sents)
53
  kw_ind, kw_score = keyword_match(['delivered', 'opinion', 'court', 'justice'], corpus=all_sents)
54
- kw_pc_ind, kw_pc_score = keyword_match(['per curiam'], corpus=all_sents)
55
 
56
  if sem_ind == kw_ind: # Definitely true
57
  return all_sents[sem_ind], paras_inds[sem_ind]
58
  if sem_pc_ind == kw_pc_ind and (sem_pc_score > sem_score or kw_pc_score > kw_score):
59
- return all_sents[sem_pc_ind], paras[sem_pc_ind]
60
  raise Exception("Could Not Locate Authoring Justice Sentence/Paragraph")
61
 
62
  def draw_line_above_sent(folderpath, sent, para_ind):
@@ -81,4 +84,4 @@ def process_file(folderpath, draw=False):
81
  paras_text = get_paragraphed_text(folderpath)
82
  majority_author_sent, majority_author_para_ind = get_majority_author_sentence(paras_text)
83
  if draw:
84
- draw_line_above_sent(folderpath, majority_author_sent, majority_author_para_ind)
 
1
+ # This file tags the major text
2
  import pandas as pd
3
  import numpy as np
4
  import re
5
  from sentence_transformers import SentenceTransformer
6
  from sentence_transformers.util import cos_sim
7
  import cv2
8
+ from nltk.tokenize import sent_tokenize, word_tokenize
9
 
10
  model = SentenceTransformer('all-mpnet-base-v2')
11
 
 
46
  all_sents = []
47
  paras_inds = []
48
  for (i, pt) in enumerate(paras_text):
49
+ # sents = re.split('\.|:',pt)
50
+ sents = sent_tokenize(pt)
51
  for s in sents:
52
  all_sents.append(s.lower())
53
  paras_inds.append(i)
54
  sem_ind, sem_score = semantic_match(template="justice x delivered the opinion of the court", corpus=all_sents)
55
  sem_pc_ind, sem_pc_score = semantic_match(template="per curiam", corpus=all_sents)
56
  kw_ind, kw_score = keyword_match(['delivered', 'opinion', 'court', 'justice'], corpus=all_sents)
57
+ kw_pc_ind, kw_pc_score = keyword_match(['per', 'curiam'], corpus=all_sents)
58
 
59
  if sem_ind == kw_ind: # Definitely true
60
  return all_sents[sem_ind], paras_inds[sem_ind]
61
  if sem_pc_ind == kw_pc_ind and (sem_pc_score > sem_score or kw_pc_score > kw_score):
62
+ return all_sents[sem_pc_ind], paras_inds[sem_pc_ind]
63
  raise Exception("Could Not Locate Authoring Justice Sentence/Paragraph")
64
 
65
  def draw_line_above_sent(folderpath, sent, para_ind):
 
84
  paras_text = get_paragraphed_text(folderpath)
85
  majority_author_sent, majority_author_para_ind = get_majority_author_sentence(paras_text)
86
  if draw:
87
+ draw_line_above_sent(folderpath, majority_author_sent, majority_author_para_ind)