Delete Tagger.py
Browse files
Tagger.py
DELETED
|
@@ -1,142 +0,0 @@
|
|
| 1 |
-
# This file tags the major text
|
| 2 |
-
import pandas as pd
|
| 3 |
-
import numpy as np
|
| 4 |
-
import re
|
| 5 |
-
from sentence_transformers import SentenceTransformer
|
| 6 |
-
from sentence_transformers.util import cos_sim
|
| 7 |
-
import cv2
|
| 8 |
-
import nltk
|
| 9 |
-
nltk.download('punkt')
|
| 10 |
-
from nltk.tokenize import sent_tokenize, word_tokenize
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
# Sentence-embedding model shared by semantic_match / draw_line_above_sent.
model = SentenceTransformer('all-mpnet-base-v2')
|
| 14 |
-
|
| 15 |
-
def get_paragraphed_text(folderpath):
    """Load ``folderpath``/paragraphs.csv and return one text string per paragraph.

    Each row of the 'Lines' column holds the repr of a list of line records;
    the last element of each record is the line's text.  Line texts are joined
    with single spaces and runs of whitespace are collapsed.
    """
    from ast import literal_eval  # safer than eval() for parsing stored list reprs

    paras_df = pd.read_csv(folderpath + '/paragraphs.csv')
    paras_lines = [literal_eval(p) for p in paras_df['Lines'].tolist()]
    paras_text = []
    for para in paras_lines:
        line_texts = [l[-1].strip() for l in para]
        # split()/join collapses any run of whitespace left by joining OCR lines
        # (the old .replace(' ', ' ') was a no-op).
        paras_text.append(" ".join(" ".join(line_texts).split()))
    return paras_text
|
| 23 |
-
|
| 24 |
-
def semantic_match(template, corpus):
    """Return (index, score) of the best semantic match for ``template`` in ``corpus``.

    ``template`` may be a single string or a list of strings; the index is the
    flat argmax over the corpus-by-template cosine-similarity matrix.
    """
    embs = model.encode(corpus)
    # isinstance also accepts list subclasses, unlike the old type() == list check.
    if isinstance(template, list):
        template_emb = model.encode(template)
    else:
        template_emb = model.encode([template])
    scores = np.asarray(cos_sim(embs, template_emb))
    # Take the true maximum similarity; the old max(scores) iterated over tensor
    # rows and raised "ambiguous truth value" for multi-template inputs.
    return np.argmax(scores), scores.max()
|
| 32 |
-
|
| 33 |
-
def get_majority_author_sentence(paras_text):
    """Locate the sentence announcing the author of the majority opinion.

    Scans each paragraph's sentences first for the "Justice ... delivered/
    announced the opinion of the Court" pattern, then for a per-curiam marker.
    Returns ``[lowercased sentence, paragraph index, 0]``.

    Raises:
        Exception: if no paragraph contains either pattern.
    """
    def _is_delivery(s):
        # "Justice X delivered/announced the opinion of the Court"
        return ("justice" in s and "opinion" in s and "court" in s
                and ("deliver" in s or "announc" in s))

    def _is_per_curiam(s):
        return "per" in s and "curiam" in s

    for (i, pt) in enumerate(paras_text):
        sents = sent_tokenize(pt)
        # Same order as the original duplicated loops: delivery pattern first,
        # then per curiam, within each paragraph.
        for pred in (_is_delivery, _is_per_curiam):
            for (j, s) in enumerate(sents):
                s = s.lower()
                if pred(s):
                    if j != 0 and j != len(sents) - 1:
                        print("Located, but not within first or last paragraph")
                    return [s, i, 0]
    raise Exception("Could Not Locate Authoring Justice Sentence/Paragraph")
|
| 50 |
-
|
| 51 |
-
def get_other_justices_sentences(paras_text, ind_maj):
    """Collect sentences that open concurrences, dissents, and recusal notes.

    Only paragraphs at or after ``ind_maj`` (the majority opinion's paragraph)
    are scanned.  Returns a dict with 'Concurrences', 'Dissents' and 'Recused'
    lists of ``(lowercased sentence, paragraph index, counter)`` tuples; the
    counter records discovery order across all categories.
    """
    # Raw strings fix the invalid '\s' escape warnings of the old patterns, and
    # [A-Za-z] replaces the buggy [A-z] class (which also matched [ \ ] ^ _ `).
    concur_re = re.compile(r',\s?concurring')
    concur_paren_re = re.compile(r'\([A-Za-z,\s]*concurring[A-Za-z,\s]*\)')
    dissent_re = re.compile(r',\s?dissenting')
    dissent_paren_re = re.compile(r'\([A-Za-z,\s]*dissenting[A-Za-z,\s]*\)')

    data = {'Concurrences': [], 'Dissents': [], 'Recused': []}
    counter = 0
    last = None
    for (i, pt) in enumerate(paras_text):
        if i < ind_maj:
            continue
        for s in sent_tokenize(pt):
            s = s.lower()
            if "justice" in s:
                # A parenthesised '(... concurring ...)' is a citation, not an
                # opinion header, so it is excluded.
                if concur_re.search(s) is not None and concur_paren_re.search(s) is None:
                    counter += 1
                    last = "C"
                    data['Concurrences'].append((s, i, counter))
                elif (dissent_re.search(s) or "dissent" in s[-9:].strip()) and dissent_paren_re.search(s) is None:
                    counter += 1
                    data['Dissents'].append((s, i, counter))
                    last = "D"
                elif "join" in s and s.index('join') > s.index('justice') and len(s.split(' ')) < 15:
                    # A short "Justice X joins" sentence extends whichever
                    # opinion type was seen last.
                    counter += 1
                    if last == "C":
                        data['Concurrences'].append((s, i, counter))
                    if last == "D":
                        data['Dissents'].append((s, i, counter))
            if "took no part" in s:  # This may not be triggered as often?
                counter += 1
                data['Recused'].append((s, i, counter))
    return data
|
| 80 |
-
|
| 81 |
-
|
| 82 |
-
def split(paras_text, maj, other_data):
    """Build a DataFrame of opinions with their paragraph spans and text.

    ``maj`` is ``[sentence, paragraph index, order]`` for the majority opinion;
    ``other_data`` holds the concurrence/dissent/recusal tuples from
    ``get_other_justices_sentences``.  Each opinion's text runs from its own
    paragraph up to the next opinion's paragraph; the last opinion runs to the
    end of the document.
    """
    opinions = [('Majority', maj[0], maj[1], maj[2])]
    for label, entries in (('Concurrence', other_data['Concurrences']),
                           ('Dissent', other_data['Dissents']),
                           ('Recused', other_data['Recused'])):
        for sent, para_ind, order in entries:
            opinions.append((label, sent, para_ind, order))

    # Sort by discovery order.  This replaces the old numpy round-trip, which
    # coerced every field (including the paragraph indices) to strings.
    opinions.sort(key=lambda o: int(o[3]))

    opinions_data = []
    for (i, opinion) in enumerate(opinions):
        start_ind = int(opinion[2])
        # Each opinion ends where the next one begins; the last runs to the end.
        end_ind = len(paras_text) if i == len(opinions) - 1 else int(opinions[i + 1][2])
        if end_ind == start_ind:
            end_ind += 1  # guarantee at least one paragraph of text
        opinions_data.append({
            'Type': opinion[0],
            'Author Sent': opinion[1],
            'Start Para Ind': start_ind,
            'End Para Ind': end_ind,
            'Text': "<PARA>".join(paras_text[start_ind:end_ind]),
        })
    return pd.DataFrame(data=opinions_data)
|
| 110 |
-
|
| 111 |
-
def draw_line_above_sent(folderpath, sent, para_ind, color=(0,0,0)):
    """Draw a line over the page image at the OCR line best matching ``sent``.

    Loads the line records of paragraph ``para_ind``, semantically matches
    ``sent`` against their texts, then draws a 2px line along the top edge of
    the matched line's bounding box on ``<page>-processed.png`` (BGR ``color``),
    overwriting the image in place.
    """
    from ast import literal_eval  # safer than eval() for the stored list reprs

    data_df = pd.read_csv(folderpath + '/data.csv')
    paras_df = pd.read_csv(folderpath + '/paragraphs.csv')
    para_lines = literal_eval(paras_df['Lines'].tolist()[para_ind])

    # Last element of each line record is its text.
    text_lines = [l[-1] for l in para_lines]

    ind, score = semantic_match(sent, text_lines)
    pg_ind, line_ind, _, text = para_lines[ind]
    line_data = literal_eval(data_df[data_df['Pg Ind'] == pg_ind]['Lines'].tolist()[0])[line_ind]
    line_bbox = line_data[0:-1]  # drop the trailing text field, keep the bbox

    img_path = folderpath + '/' + str(pg_ind) + '-processed.png'
    image = cv2.imread(img_path)
    image = cv2.line(image, (line_bbox[0] - 10, line_bbox[1]),
                     (line_bbox[2] + 10, line_bbox[1]), color=color, thickness=2)
    cv2.imwrite(img_path, image)
|
| 128 |
-
|
| 129 |
-
|
| 130 |
-
def process_file(folderpath, draw=False):
    """Tag one case folder: split its paragraphs into opinions and save them.

    Writes ``folderpath``/opinions.csv.  When ``draw`` is true, also annotates
    the processed page images: black line for the majority author sentence,
    green for concurrences, red for dissents.
    """
    paragraphs = get_paragraphed_text(folderpath)
    majority = get_majority_author_sentence(paragraphs)
    others = get_other_justices_sentences(paragraphs, majority[1])
    split(paragraphs, majority, others).to_csv(folderpath + '/opinions.csv', index=False)

    if not draw:
        return
    draw_line_above_sent(folderpath, majority[0], majority[1])
    for category, line_color in (('Concurrences', (0,100,0)), ('Dissents', (0,0,100))):
        for sentence, para_ind, _ in others[category]:
            draw_line_above_sent(folderpath, sentence, para_ind, color=line_color)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|