Spaces:

cools
/

Gideon

Runtime error

App Files Files Community

cools committed on Jul 16, 2023

Commit

5b63bc4

1 Parent(s): 2533b14

Update Tagger.py

Browse files

(1) More robust way to identify the majority, and (2) can detect concurrences & dissents

Files changed (1) hide show

Tagger.py +39 -35

Tagger.py CHANGED Viewed

@@ -20,7 +20,6 @@ def get_paragraphed_text(folderpath):
         paras_text.append(" ".join(para_lines).strip().replace('  ', ' '))
     return paras_text
 def semantic_match(template, corpus):
     embs = model.encode(corpus)
     if type(template) == list:
@@ -30,41 +29,40 @@ def semantic_match(template, corpus):
     scores = cos_sim(embs, template_emb)
     return np.argmax(scores), max(scores)
-def keyword_match(keywords, corpus):
-    scores = []
-    for (i, c) in enumerate(corpus):
-        precision = sum(word.lower() in keywords for word in c.split(' ')) / len(c.split(' '))
-        recall = sum(k in c.split(' ') for k in keywords) / len(keywords)
-        if (precision + recall) == 0:
-            f1 = 0
-        else:
-            f1 = 2 * precision * recall / (precision + recall)
-        scores.append(f1)
-    return np.argmax(scores), max(scores)
 def get_majority_author_sentence(paras_text):
-    all_sents = []
-    paras_inds = []
-    for (i, pt) in enumerate(paras_text):
-        # sents = re.split('\.|:',pt)
         sents = sent_tokenize(pt)
-        for s in sents:
-            all_sents.append(s.lower())
-            paras_inds.append(i)
-    sem_ind, sem_score = semantic_match(template="justice x delivered the opinion of the court", corpus=all_sents)
-    sem_pc_ind, sem_pc_score = semantic_match(template="per curiam", corpus=all_sents)
-    kw_ind, kw_score = keyword_match(['delivered', 'opinion', 'court', 'justice'], corpus=all_sents)
-    kw_pc_ind, kw_pc_score = keyword_match(['per', 'curiam'], corpus=all_sents)
-    if sem_ind == kw_ind:  # Definitely true
-        return all_sents[sem_ind], paras_inds[sem_ind]
-    if sem_pc_ind == kw_pc_ind and (sem_pc_score > sem_score or kw_pc_score > kw_score):
-        return all_sents[sem_pc_ind], paras_inds[sem_pc_ind]
     raise Exception("Could Not Locate Authoring Justice Sentence/Paragraph")
-def draw_line_above_sent(folderpath, sent, para_ind):
     data_df = pd.read_csv(folderpath + '/data.csv')
     paras_df = pd.read_csv(folderpath + '/paragraphs.csv')
     para_lines = eval(paras_df['Lines'].tolist()[para_ind])
@@ -79,11 +77,17 @@ def draw_line_above_sent(folderpath, sent, para_ind):
     line_data = eval(data_df[data_df['Pg Ind'] == pg_ind]['Lines'].tolist()[0])[line_ind]
     line_bbox = line_data[0:-1]
     image = cv2.imread(folderpath + '/' + str(pg_ind) + '-processed.png')
-    image = cv2.line(image, (line_bbox[0] - 10, line_bbox[1]), (line_bbox[2] + 10, line_bbox[1]), (0, 0, 0), 3)
     cv2.imwrite(folderpath + '/' + str(pg_ind) + '-processed.png', image)
 def process_file(folderpath, draw=False):
     paras_text = get_paragraphed_text(folderpath)
-    majority_author_sent, majority_author_para_ind = get_majority_author_sentence(paras_text)
     if draw:
-        draw_line_above_sent(folderpath, majority_author_sent, majority_author_para_ind)

         paras_text.append(" ".join(para_lines).strip().replace('  ', ' '))
     return paras_text
 def semantic_match(template, corpus):
     embs = model.encode(corpus)
     if type(template) == list:
     scores = cos_sim(embs, template_emb)
     return np.argmax(scores), max(scores)
 def get_majority_author_sentence(paras_text):
+    for (i,pt) in enumerate(paras_text):
         sents = sent_tokenize(pt)
+        for (j,s) in enumerate(sents):
+            s = s.lower()
+            if ("justice" in s and "opinion" in s and "court" in s and ("delivered" in s or "announced" in s)):
+                if j != 0 and j != len(sents)-1:
+                    print("Located, but not within first or last paragraph")
+                return [s, i]
+        for (j,s) in enumerate(sents):  # Per curiam
+            s = s.lower()
+            if ("per" in s and "curiam" in s):
+                if j != 0 and j != len(sents)-1:
+                    print("Located, but not within first or last paragraph")
+                return [s, i]
     raise Exception("Could Not Locate Authoring Justice Sentence/Paragraph")
def get_other_author_sentence(paras_text, ind_maj):
    """Collect sentences that look like concurrence or dissent headers.

    Paragraphs whose index is below ``ind_maj`` are skipped. Every remaining
    paragraph is split with ``sent_tokenize`` and each sentence is lowercased.
    A sentence containing both "justice" and a comma is recorded under
    'Concurrences' when it contains "concurring" and under 'Dissents' when it
    contains "dissenting"; a sentence containing both words lands in both
    lists.

    Args:
        paras_text: Sequence of paragraph strings, in document order.
        ind_maj: Paragraph index at which scanning starts (inclusive).

    Returns:
        A dict with keys 'Concurrences' and 'Dissents', each a list of
        ``(lowercased_sentence, paragraph_index)`` tuples in document order.
    """
    concurrences, dissents = [], []
    for para_ind, para in enumerate(paras_text):
        if para_ind < ind_maj:
            continue
        for raw_sentence in sent_tokenize(para):
            sentence = raw_sentence.lower()
            # Only sentences with both "justice" and a comma are candidates.
            if "justice" not in sentence or "," not in sentence:
                continue
            hit = (sentence, para_ind)
            if "concurring" in sentence:
                concurrences.append(hit)
            if "dissenting" in sentence:
                dissents.append(hit)
    return {'Concurrences': concurrences, 'Dissents': dissents}
+def draw_line_above_sent(folderpath, sent, para_ind, color=(0,0,0)):
     data_df = pd.read_csv(folderpath + '/data.csv')
     paras_df = pd.read_csv(folderpath + '/paragraphs.csv')
     para_lines = eval(paras_df['Lines'].tolist()[para_ind])
     line_data = eval(data_df[data_df['Pg Ind'] == pg_ind]['Lines'].tolist()[0])[line_ind]
     line_bbox = line_data[0:-1]
     image = cv2.imread(folderpath + '/' + str(pg_ind) + '-processed.png')
+    image = cv2.line(image, (line_bbox[0] - 10, line_bbox[1]), (line_bbox[2] + 10, line_bbox[1]), thickness=2)
     cv2.imwrite(folderpath + '/' + str(pg_ind) + '-processed.png', image)
 def process_file(folderpath, draw=False):
     paras_text = get_paragraphed_text(folderpath)
+    maj = get_majority_author_sentence(paras_text)
+    data = get_other_author_sentence(paras_text, maj[1])
     if draw:
+        draw_line_above_sent(folderpath, maj[0], maj[1])
+        for c in data['Concurrences']:
+            draw_line_above_sent(folderpath, c[0], c[1], color=(0,100,0))
+        for d in data['Dissents']:
+            draw_line_above_sent(folderpath, d[0], d[1], color=(0,0,100))