Spaces:

cools
/

Gideon

Runtime error

App Files Files Community

cools committed on Jul 14, 2023

Commit

ca87ae5

1 Parent(s): 0c734ac

Upload Tagger

Browse files

Tags (and visualizes) various aspects of the opinions

Files changed (1) hide show

Tagger.py +73 -0

Tagger.py ADDED Viewed

	@@ -0,0 +1,73 @@

+# This file tags the major text
+import pandas as pd
+import numpy as np
+import re
+from sentence_transformers import SentenceTransformer
+from sentence_transformers.util import cos_sim
+import cv2
def get_paragraphed_text(folderpath):
    """Load the paragraphs of a processed case as plain strings.

    Reads ``<folderpath>/paragraphs.csv``. Each cell of its ``Lines`` column
    is parsed as a Python literal holding a sequence of line records; the
    last element of every record is taken as that line's text. The stripped
    line texts of a paragraph are joined with single spaces.

    Args:
        folderpath: Directory that contains ``paragraphs.csv``.

    Returns:
        A list with one string per CSV row, in file order.

    Raises:
        ValueError, SyntaxError: If a ``Lines`` cell is not a plain Python
            literal.
    """
    # Function-scope import keeps this block self-contained.
    import ast

    paras_df = pd.read_csv(folderpath + '/paragraphs.csv')
    paras_text = []
    for cell in paras_df['Lines'].tolist():
        # literal_eval accepts literals only, so the CSV content can never
        # execute code the way a bare eval() would.
        para = ast.literal_eval(cell)
        joined = " ".join(line[-1].strip() for line in para)
        paras_text.append(joined.strip().replace('  ', ' '))
    return paras_text
_MATCH_MODEL_NAME = 'paraphrase-MiniLM-L6-v2'
_MATCH_MODELS = {}


def _get_match_model(name=_MATCH_MODEL_NAME):
    """Return the sentence-embedding model *name*, loading it on first use."""
    if name not in _MATCH_MODELS:
        _MATCH_MODELS[name] = SentenceTransformer(name)
    return _MATCH_MODELS[name]


def semantic_match(template, corpus):
    """Find the corpus entry most similar to a template (or set of templates).

    Every corpus entry and every template is embedded with the
    ``paraphrase-MiniLM-L6-v2`` sentence-transformer and compared by cosine
    similarity. When several templates are given, an entry's score is its
    best similarity over all templates.

    Args:
        template: A single string, or a list of strings.
        corpus: Non-empty list of strings to search.

    Returns:
        ``(index, score)``: the position in ``corpus`` of the best match as
        an ``int`` and its cosine similarity as a ``float``.

    Raises:
        ValueError: If ``corpus`` is empty.
    """
    if len(corpus) == 0:
        raise ValueError("semantic_match() requires a non-empty corpus")

    # The model is cached so repeated calls do not reload it from disk.
    model = _get_match_model()
    templates = template if isinstance(template, list) else [template]

    corpus_embs = model.encode(corpus)
    template_embs = model.encode(templates)

    # Rows are corpus entries, columns are templates.
    scores = np.asarray(cos_sim(corpus_embs, template_embs))

    # Reduce over templates first so the argmax is a corpus row index, not a
    # position in the flattened (rows x templates) matrix.
    best_per_entry = scores.max(axis=1)
    ind = int(best_per_entry.argmax())
    return ind, float(best_per_entry[ind])
def get_majority_author_sentence(paras_text):
    """Locate the sentence that names the author of the majority opinion.

    Each paragraph is split on ``.`` and ``:`` into lower-cased fragments.
    The fragments are matched semantically against the template
    "justice x delivered the opinion of the court"; if no fragment scores
    above 0.6, the template "per curiam" is tried instead.

    Args:
        paras_text: List of paragraph strings.

    Returns:
        ``(sentence, para_index, score)``: the matching lower-cased fragment,
        the index in ``paras_text`` of the paragraph it came from, and the
        similarity score returned by ``semantic_match``.

    Raises:
        Exception: If no fragment scores above 0.6 for either template, or
            if the input contains no non-blank fragment.
    """
    all_sents = []
    paras_inds = []
    for i, pt in enumerate(paras_text):
        for s in re.split(r'[.:]', pt):
            # Splitting leaves empty/whitespace-only pieces (e.g. after a
            # trailing period); they carry no meaning and are not matched.
            if not s.strip():
                continue
            all_sents.append(s.lower())
            paras_inds.append(i)

    if all_sents:
        # Templates are tried in priority order; the first one that clears
        # the threshold wins.
        templates = (
            "justice x delivered the opinion of the court",
            "per curiam",
        )
        for template in templates:
            ind, score = semantic_match(template, all_sents)
            if score > 0.6:
                return all_sents[ind], paras_inds[ind], score

    raise Exception("Could Not Locate Authoring Justice Sentence/Paragraph")
def draw_line_above_sent(folderpath, sent, para_ind):
    """Draw a horizontal black line on the page image at the line matching *sent*.

    The paragraph at ``para_ind`` is read from ``paragraphs.csv``; its line
    texts are matched semantically against ``sent`` to pick one line record
    ``(pg_ind, line_ind, _, text)``. The corresponding entry in ``data.csv``
    (row with ``Pg Ind == pg_ind``, element ``line_ind`` of its ``Lines``
    list) supplies the coordinates. The image
    ``<folderpath>/<pg_ind>-processed.png`` is overwritten in place with a
    3-pixel-thick line added.

    Args:
        folderpath: Directory holding ``data.csv``, ``paragraphs.csv`` and
            the ``*-processed.png`` page images.
        sent: Sentence text to locate within the paragraph.
        para_ind: Row index of the paragraph in ``paragraphs.csv``.

    Raises:
        IndexError: If ``data.csv`` has no row for the matched page.
        FileNotFoundError: If the page image cannot be read.
        OSError: If the modified image cannot be written back.
        ValueError, SyntaxError: If a ``Lines`` cell is not a plain Python
            literal.
    """
    # Function-scope import keeps this block self-contained.
    import ast

    data_df = pd.read_csv(folderpath + '/data.csv')
    paras_df = pd.read_csv(folderpath + '/paragraphs.csv')

    # literal_eval instead of eval: CSV content must never be executed.
    para_lines = ast.literal_eval(paras_df['Lines'].tolist()[para_ind])
    text_lines = [text for (_, _, _, text) in para_lines]

    ind, _ = semantic_match(sent, text_lines)
    pg_ind, line_ind, _, _ = para_lines[ind]

    page_rows = data_df[data_df['Pg Ind'] == pg_ind]['Lines'].tolist()
    if not page_rows:
        raise IndexError("No row with 'Pg Ind' == %r in data.csv" % (pg_ind,))
    line_data = ast.literal_eval(page_rows[0])[line_ind]

    # Everything but the last element is treated as the bounding box.
    # NOTE(review): presumably (x_left, y_top, x_right, ...) - confirm
    # against the code that writes data.csv.
    line_bbox = line_data[0:-1]
    # cv2.line needs integer pixel coordinates.
    x_start = int(line_bbox[0]) - 10
    x_end = int(line_bbox[2]) + 10
    y = int(line_bbox[1])

    image_path = folderpath + '/' + str(pg_ind) + '-processed.png'
    image = cv2.imread(image_path)
    # imread signals failure by returning None rather than raising.
    if image is None:
        raise FileNotFoundError("Could not read page image: " + image_path)

    image = cv2.line(image, (x_start, y), (x_end, y), (0, 0, 0), 3)
    # imwrite signals failure through its boolean return value.
    if not cv2.imwrite(image_path, image):
        raise OSError("Could not write page image: " + image_path)
def process_file(folderpath, draw=False):
    """Tag the majority-author sentence of one processed case.

    Loads the case's paragraphs from ``folderpath``, locates the sentence
    naming the authoring justice, and, when ``draw`` is true, marks that
    sentence on the page image.

    Args:
        folderpath: Directory holding the case's CSV files and page images.
        draw: When true, draw a line on the page image at the located
            sentence.

    Raises:
        Exception: Propagated from ``get_majority_author_sentence`` when no
            authoring sentence can be located.
    """
    paragraphs = get_paragraphed_text(folderpath)
    sentence, para_index, _score = get_majority_author_sentence(paragraphs)
    if not draw:
        return
    draw_line_above_sent(folderpath, sentence, para_index)
# Tag the sample case and mark the located sentence on its page image.
# NOTE(review): this is a top-level statement, so it also runs whenever the
# module is imported; consider an `if __name__ == "__main__":` guard.
process_file(folderpath="PDF Cases/462_122", draw=True)