Spaces:

cools
/

Gideon

Runtime error

App Files Files Community

cools committed on Jul 15, 2023

Commit

3aa9c84

1 Parent(s): ab52aeb

Update Tagger.py

Browse files

Files changed (1) hide show

Tagger.py +28 -15

Tagger.py CHANGED Viewed

@@ -1,4 +1,3 @@
-# This file tags the major text
 import pandas as pd
 import numpy as np
 import re
@@ -6,6 +5,8 @@ from sentence_transformers import SentenceTransformer
 from sentence_transformers.util import cos_sim
 import cv2
 def get_paragraphed_text(folderpath):
     paras_df = pd.read_csv(folderpath + '/paragraphs.csv')
     paras_lines = [eval(p) for p in paras_df['Lines'].tolist()]
@@ -17,7 +18,6 @@ def get_paragraphed_text(folderpath):
 def semantic_match(template, corpus):
-    model = SentenceTransformer('all-mpnet-base-v2')
     embs = model.encode(corpus)
     if type(template) == list:
         template_emb = model.encode(template)
@@ -26,25 +26,38 @@ def semantic_match(template, corpus):
     scores = cos_sim(embs, template_emb)
     return np.argmax(scores), max(scores)
 def get_majority_author_sentence(paras_text):
-    template = "justice x delivered the opinion of the court"
     all_sents = []
     paras_inds = []
-    for (i,pt) in enumerate(paras_text):
         sents = re.split('\.|:', pt)
         for s in sents:
             all_sents.append(s.lower())
             paras_inds.append(i)
-    ind, score = semantic_match(template, all_sents)
-    if score > 0.65:
-        return all_sents[ind], paras_inds[ind], score
-    else:
-        template = "per curiam"
-        ind, score = semantic_match(template, all_sents)
-        if score > 0.6:
-            return all_sents[ind], paras_inds[ind], score
-        else:
-            raise Exception("Could Not Locate Authoring Justice Sentence/Paragraph")
 def draw_line_above_sent(folderpath, sent, para_ind):
     data_df = pd.read_csv(folderpath + '/data.csv')
@@ -66,6 +79,6 @@ def draw_line_above_sent(folderpath, sent, para_ind):
 def process_file(folderpath, draw=False):
     paras_text = get_paragraphed_text(folderpath)
-    majority_author_sent, majority_author_para_ind, score = get_majority_author_sentence(paras_text)
     if draw:
         draw_line_above_sent(folderpath, majority_author_sent, majority_author_para_ind)

 import pandas as pd
 import numpy as np
 import re
 from sentence_transformers.util import cos_sim
 import cv2
+model = SentenceTransformer('all-mpnet-base-v2')
 def get_paragraphed_text(folderpath):
     paras_df = pd.read_csv(folderpath + '/paragraphs.csv')
     paras_lines = [eval(p) for p in paras_df['Lines'].tolist()]
 def semantic_match(template, corpus):
     embs = model.encode(corpus)
     if type(template) == list:
         template_emb = model.encode(template)
     scores = cos_sim(embs, template_emb)
     return np.argmax(scores), max(scores)
def keyword_match(keywords, corpus):
    """Find the corpus entry whose words best overlap a set of keywords.

    Each entry is scored with an F1 measure computed over whitespace-separated,
    lowercased words:

    * precision -- fraction of the entry's words that are keyword words;
    * recall    -- fraction of the distinct keyword words present in the entry.

    Multi-word keywords (e.g. ``'per curiam'``) are broken into their
    individual words, since entries are compared word by word and a phrase
    containing a space could otherwise never match.

    Args:
        keywords: Iterable of keyword strings; matching is case-insensitive.
        corpus: Iterable of strings to score.

    Returns:
        ``(index, score)`` for the highest-scoring entry. Ties resolve to the
        earliest entry. ``score`` is 0.0 when nothing matches.

    Raises:
        ValueError: If ``keywords`` contains no words or ``corpus`` is empty.
    """
    # A set gives O(1) membership tests and de-duplicates repeated keywords.
    keyword_tokens = {tok for kw in keywords for tok in kw.lower().split()}
    if not keyword_tokens:
        raise ValueError("keyword_match requires at least one non-empty keyword")

    scores = []
    for sentence in corpus:
        # split() without an argument drops the empty tokens that leading or
        # repeated spaces would otherwise add to the word count.
        words = sentence.lower().split()
        hits = sum(word in keyword_tokens for word in words)
        if not hits:
            # No overlap: precision and recall are both 0, so F1 is 0.
            scores.append(0.0)
            continue
        precision = hits / len(words)
        recall = len(keyword_tokens.intersection(words)) / len(keyword_tokens)
        scores.append(2 * precision * recall / (precision + recall))

    if not scores:
        raise ValueError("keyword_match requires a non-empty corpus")

    best = int(np.argmax(scores))  # argmax returns the first maximum
    return best, scores[best]
 def get_majority_author_sentence(paras_text):
     all_sents = []
     paras_inds = []
+    for (i, pt) in enumerate(paras_text):
         sents = re.split('\.|:', pt)
         for s in sents:
             all_sents.append(s.lower())
             paras_inds.append(i)
+    sem_ind, sem_score = semantic_match(template="justice x delivered the opinion of the court", corpus=all_sents)
+    sem_pc_ind, sem_pc_score = semantic_match(template="per curiam", corpus=all_sents)
+    kw_ind, kw_score = keyword_match(['delivered', 'opinion', 'court', 'justice'], corpus=all_sents)
+    kw_pc_ind, kw_pc_score = keyword_match(['per curiam'], corpus=all_sents)
+    if sem_ind == kw_ind:  # Definitely true
+        return all_sents[sem_ind], paras_inds[sem_ind]
+    if sem_pc_ind == kw_pc_ind and (sem_pc_score > sem_score or kw_pc_score > kw_score):
+        return all_sents[sem_pc_ind], paras[sem_pc_ind]
+    raise Exception("Could Not Locate Authoring Justice Sentence/Paragraph")
 def draw_line_above_sent(folderpath, sent, para_ind):
     data_df = pd.read_csv(folderpath + '/data.csv')
 def process_file(folderpath, draw=False):
     paras_text = get_paragraphed_text(folderpath)
+    majority_author_sent, majority_author_para_ind = get_majority_author_sentence(paras_text)
     if draw:
         draw_line_above_sent(folderpath, majority_author_sent, majority_author_para_ind)