Spaces:

cools
/

Gideon

Runtime error

App Files Community

cools committed on Jul 15, 2023

Commit

7437b7b

1 Parent(s): ee033d3

Updated Tagger to better tokenize sentences and to fix a bug

Browse files

Files changed (1) hide show

Tagger.py +7 -4

Tagger.py CHANGED Viewed

@@ -1,9 +1,11 @@
 import pandas as pd
 import numpy as np
 import re
 from sentence_transformers import SentenceTransformer
 from sentence_transformers.util import cos_sim
 import cv2
 model = SentenceTransformer('all-mpnet-base-v2')
@@ -44,19 +46,20 @@ def get_majority_author_sentence(paras_text):
     all_sents = []
     paras_inds = []
     for (i, pt) in enumerate(paras_text):
-        sents = re.split('\.|:', pt)
         for s in sents:
             all_sents.append(s.lower())
             paras_inds.append(i)
     sem_ind, sem_score = semantic_match(template="justice x delivered the opinion of the court", corpus=all_sents)
     sem_pc_ind, sem_pc_score = semantic_match(template="per curiam", corpus=all_sents)
     kw_ind, kw_score = keyword_match(['delivered', 'opinion', 'court', 'justice'], corpus=all_sents)
-    kw_pc_ind, kw_pc_score = keyword_match(['per curiam'], corpus=all_sents)
     if sem_ind == kw_ind:  # Definitely true
         return all_sents[sem_ind], paras_inds[sem_ind]
     if sem_pc_ind == kw_pc_ind and (sem_pc_score > sem_score or kw_pc_score > kw_score):
-        return all_sents[sem_pc_ind], paras[sem_pc_ind]
     raise Exception("Could Not Locate Authoring Justice Sentence/Paragraph")
 def draw_line_above_sent(folderpath, sent, para_ind):
@@ -81,4 +84,4 @@ def process_file(folderpath, draw=False):
     paras_text = get_paragraphed_text(folderpath)
     majority_author_sent, majority_author_para_ind = get_majority_author_sentence(paras_text)
     if draw:
-        draw_line_above_sent(folderpath, majority_author_sent, majority_author_para_ind)

+# This file tags the major text
 import pandas as pd
 import numpy as np
 import re
 from sentence_transformers import SentenceTransformer
 from sentence_transformers.util import cos_sim
 import cv2
+from nltk.tokenize import sent_tokenize, word_tokenize
 model = SentenceTransformer('all-mpnet-base-v2')
     all_sents = []
     paras_inds = []
     for (i, pt) in enumerate(paras_text):
+        # sents = re.split('\.|:',pt)
+        sents = sent_tokenize(pt)
         for s in sents:
             all_sents.append(s.lower())
             paras_inds.append(i)
     sem_ind, sem_score = semantic_match(template="justice x delivered the opinion of the court", corpus=all_sents)
     sem_pc_ind, sem_pc_score = semantic_match(template="per curiam", corpus=all_sents)
     kw_ind, kw_score = keyword_match(['delivered', 'opinion', 'court', 'justice'], corpus=all_sents)
+    kw_pc_ind, kw_pc_score = keyword_match(['per', 'curiam'], corpus=all_sents)
     if sem_ind == kw_ind:  # Definitely true
         return all_sents[sem_ind], paras_inds[sem_ind]
     if sem_pc_ind == kw_pc_ind and (sem_pc_score > sem_score or kw_pc_score > kw_score):
+        return all_sents[sem_pc_ind], paras_inds[sem_pc_ind]
     raise Exception("Could Not Locate Authoring Justice Sentence/Paragraph")
 def draw_line_above_sent(folderpath, sent, para_ind):
     paras_text = get_paragraphed_text(folderpath)
     majority_author_sent, majority_author_para_ind = get_majority_author_sentence(paras_text)
     if draw:
+        draw_line_above_sent(folderpath, majority_author_sent, majority_author_para_ind)