Spaces:

cools
/

Gideon

Runtime error

App Files Files Community

Gideon / Tagger.py

cools

Update Tagger.py

2c92324 over 2 years ago

raw

history blame

5.65 kB

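	# Locates the authoring-justice sentences (majority, concurrences, dissents, recusals) in paragraphed text and splits that text into per-opinion segments.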
	import pandas as pd
	import numpy as np
	import re
	from sentence_transformers import SentenceTransformer
	from sentence_transformers.util import cos_sim
	import cv2
	import nltk
	# Runs at import time: asks NLTK to download the 'punkt' resource (presumably the data sent_tokenize needs -- confirm).
	nltk.download('punkt')
	from nltk.tokenize import sent_tokenize, word_tokenize


	# Module-level sentence-embedding model, constructed once at import time; semantic_match() calls model.encode() on it.
	model = SentenceTransformer('all-mpnet-base-v2')

	def get_paragraphed_text(folderpath):
	    """Read ``paragraphs.csv`` in *folderpath* and return one string per paragraph.

	    Each cell of the ``Lines`` column holds the Python-literal text of a
	    sequence of line records; the last element of every record is used as
	    the line's text.

	    Args:
	        folderpath: Directory that contains ``paragraphs.csv``.

	    Returns:
	        A list of strings, one per CSV row: the stripped line texts joined
	        by single spaces, with runs of whitespace collapsed to one space.

	    Raises:
	        ValueError / SyntaxError: if a ``Lines`` cell is not a plain Python
	            literal.
	    """
	    # Function-local import keeps this block self-contained.
	    import ast

	    paras_df = pd.read_csv(folderpath + '/paragraphs.csv')
	    paras_text = []
	    for serialized in paras_df['Lines'].tolist():
	        # SECURITY: this used to be eval(), which executes whatever code the
	        # CSV contains. literal_eval only accepts literals (lists, tuples,
	        # numbers, strings, ...), which is all a line record needs.
	        para = ast.literal_eval(serialized)
	        line_texts = [line[-1].strip() for line in para]
	        # The previous `.replace(' ', ' ')` was a no-op; collapse whitespace
	        # runs explicitly so joined lines never carry doubled spaces.
	        paras_text.append(" ".join(" ".join(line_texts).split()))
	    return paras_text

	def semantic_match(template, corpus):
	    """Find the corpus entry that is semantically closest to the template(s).

	    Args:
	        template: A single string, or a list of strings (each one an
	            acceptable phrasing of what is being looked for).
	        corpus: Sequence of candidate strings to search.

	    Returns:
	        ``(index, score)`` where ``index`` is the position in ``corpus`` of
	        the best-matching entry and ``score`` is a one-element tensor with
	        its cosine similarity (the best over all templates when several
	        were supplied).
	    """
	    corpus_embs = model.encode(corpus)
	    # isinstance (not type() ==) so list subclasses are also treated as lists.
	    templates = template if isinstance(template, list) else [template]
	    template_embs = model.encode(templates)

	    # cos_sim yields one row per corpus entry and one column per template.
	    scores = cos_sim(corpus_embs, template_embs)

	    # Reduce over the template axis first. Taking argmax of the whole matrix
	    # gave a flat index (not a corpus index) as soon as more than one
	    # template was passed, and max() over multi-column rows raised.
	    # keepdim=True keeps the (n, 1) shape, so single-template results are
	    # exactly what they were before.
	    row_best = scores.max(dim=1, keepdim=True).values
	    best_ind = np.argmax(row_best)
	    return best_ind, row_best[int(best_ind)]

	def get_majority_author_sentence(paras_text):
	    """Locate the sentence that names the author of the majority opinion.

	    Paragraphs are scanned in order. Within a paragraph, every sentence is
	    first checked for an "opinion of the court" announcement; only if none
	    matches are the same sentences checked for "per curiam".

	    Args:
	        paras_text: List of paragraph strings.

	    Returns:
	        ``[sentence, paragraph_index, 0]`` where ``sentence`` is the
	        lower-cased matching sentence and the trailing ``0`` is its
	        ordering key.

	    Raises:
	        Exception: if no paragraph contains a matching sentence.
	    """
	    # NOTE(review): block nesting was reconstructed from flattened source --
	    # confirm that the per-curiam pass belongs inside the paragraph loop.

	    def announces_opinion(text):
	        if "justice" not in text or "opinion" not in text or "court" not in text:
	            return False
	        return "delivered" in text or "announced" in text

	    def is_per_curiam(text):
	        return "per" in text and "curiam" in text

	    for para_ind, para in enumerate(paras_text):
	        lowered = [sentence.lower() for sentence in sent_tokenize(para)]
	        last_pos = len(lowered) - 1
	        # Author announcement takes priority over per curiam in a paragraph.
	        for matches in (announces_opinion, is_per_curiam):
	            for pos, sentence in enumerate(lowered):
	                if not matches(sentence):
	                    continue
	                if pos not in (0, last_pos):
	                    # Warn only; the match is still returned.
	                    print("Located, but not within first or last paragraph")
	                return [sentence, para_ind, 0]
	    raise Exception("Could Not Locate Authoring Justice Sentence/Paragraph")

	def get_other_justice_sentences(paras_text, ind_maj):
	    """Collect concurrence, dissent and recusal sentences from the text.

	    Only paragraphs at index ``ind_maj`` or later are examined, and only
	    sentences containing "justice" are considered.

	    Args:
	        paras_text: List of paragraph strings.
	        ind_maj: Paragraph index of the majority-author sentence.

	    Returns:
	        Dict with keys ``'Concurrences'``, ``'Dissents'`` and ``'Recused'``,
	        each a list of ``(lower-cased sentence, paragraph index, order)``
	        tuples. ``order`` is a counter shared by all three lists that grows
	        by one for every tagged (or "join") sentence.
	    """
	    # NOTE(review): nesting was reconstructed from flattened source; the
	    # "took no part" check is assumed to sit inside the "justice" gate.
	    concurrences, dissents, recused = [], [], []
	    order = 0
	    previous_kind = None  # "C" / "D": kind of the most recent opinion header

	    for para_ind, para in enumerate(paras_text):
	        if para_ind >= ind_maj:
	            for raw_sentence in sent_tokenize(para):
	                sentence = raw_sentence.lower()
	                if "justice" not in sentence:
	                    continue

	                has_comma = "," in sentence
	                if "concurring" in sentence and has_comma:
	                    order += 1
	                    previous_kind = "C"
	                    concurrences.append((sentence, para_ind, order))
	                elif "dissenting" in sentence and has_comma:
	                    order += 1
	                    previous_kind = "D"
	                    dissents.append((sentence, para_ind, order))
	                elif "join" in sentence:
	                    # A "join" sentence is filed under whichever kind of
	                    # opinion was seen last; the counter advances even when
	                    # none has been seen yet.
	                    order += 1
	                    if previous_kind == "C":
	                        concurrences.append((sentence, para_ind, order))
	                    elif previous_kind == "D":
	                        dissents.append((sentence, para_ind, order))

	                # Independent of the chain above: one sentence may be both.
	                if "took no part" in sentence:
	                    order += 1
	                    recused.append((sentence, para_ind, order))

	    return {'Concurrences': concurrences, 'Dissents': dissents, 'Recused': recused}

	def split(paras_text, maj, other_data):
	    """Cut the paragraph list into one text segment per tagged sentence.

	    Args:
	        paras_text: List of paragraph strings.
	        maj: ``[sentence, paragraph index, order]`` for the majority opinion.
	        other_data: Dict with ``'Concurrences'``, ``'Dissents'`` and
	            ``'Recused'`` lists of ``(sentence, paragraph index, order)``.

	    Returns:
	        A DataFrame with columns ``Type``, ``Author Sent``,
	        ``Start Para Ind``, ``End Para Ind`` and ``Text``, one row per entry
	        sorted by ``order``. Each segment runs from its own start paragraph
	        up to (not including) the next entry's start paragraph; the last
	        segment runs to the end. Paragraphs are joined with ``<PARA>``.
	    """
	    labelled = [('Majority', maj)]
	    for label, key in (('Concurrence', 'Concurrences'),
	                       ('Dissent', 'Dissents'),
	                       ('Recused', 'Recused')):
	        labelled.extend((label, entry) for entry in other_data[key])

	    # Mixed str/int rows become an all-string array, hence the int() casts.
	    table = np.array([(label, entry[0], entry[1], entry[2]) for label, entry in labelled])
	    table = table[np.argsort(table[:, 3].astype(int))]

	    # Every segment ends where the following one starts.
	    starts = [int(row[2]) for row in table]
	    ends = starts[1:] + [len(paras_text)]

	    records = []
	    for row, start_ind, end_ind in zip(table, starts, ends):
	        records.append({
	            'Type': row[0],
	            'Author Sent': row[1],
	            'Start Para Ind': start_ind,
	            'End Para Ind': end_ind,
	            'Text': "<PARA>".join(paras_text[start_ind:end_ind]),
	        })
	    return pd.DataFrame(data=records)

	def draw_line_above_sent(folderpath, sent, para_ind, color=(0,0,0)):
	    """Draw a horizontal rule on a page image at the line best matching *sent*.

	    The paragraph's line records are read from ``paragraphs.csv``, the line
	    whose text is semantically closest to ``sent`` is chosen, its bounding
	    box is looked up in ``data.csv``, and ``<page>-processed.png`` is
	    rewritten in place with the rule drawn on it.

	    Args:
	        folderpath: Directory holding ``data.csv``, ``paragraphs.csv`` and
	            the ``<page>-processed.png`` images.
	        sent: Sentence to locate inside the paragraph.
	        para_ind: Row index of the paragraph in ``paragraphs.csv``.
	        color: Line colour tuple passed straight to ``cv2.line``.

	    Raises:
	        OSError: if the page image cannot be read.
	        ValueError / SyntaxError: if a ``Lines`` cell is not a plain Python
	            literal.
	    """
	    # Function-local import keeps this block self-contained.
	    import ast

	    data_df = pd.read_csv(folderpath + '/data.csv')
	    paras_df = pd.read_csv(folderpath + '/paragraphs.csv')
	    # SECURITY: literal_eval instead of eval -- CSV cells must never be
	    # executed as code.
	    para_lines = ast.literal_eval(paras_df['Lines'].tolist()[para_ind])
	    # Each record unpacks as (page index, line index, <unused>, text).
	    text_lines = [text for _, _, _, text in para_lines]

	    ind, _score = semantic_match(sent, text_lines)
	    pg_ind, line_ind, _, _ = para_lines[ind]
	    page_lines = ast.literal_eval(data_df[data_df['Pg Ind'] == pg_ind]['Lines'].tolist()[0])
	    # Everything but the last element; presumably bbox coordinates followed
	    # by the line text -- TODO confirm against the producer of data.csv.
	    line_bbox = page_lines[line_ind][0:-1]

	    image_path = folderpath + '/' + str(pg_ind) + '-processed.png'
	    image = cv2.imread(image_path)
	    if image is None:
	        # imread signals failure by returning None instead of raising; fail
	        # here with a clear message rather than inside cv2.line.
	        raise OSError("Could not read page image: " + image_path)
	    # Rule at y = bbox[1], extended 10 px past the box on both sides.
	    image = cv2.line(image, (line_bbox[0] - 10, line_bbox[1]), (line_bbox[2] + 10, line_bbox[1]), color=color, thickness=2)
	    cv2.imwrite(image_path, image)


	def process_file(folderpath, draw=False):
	    """Tag the opinions found in *folderpath* and write ``opinions.csv`` there.

	    Args:
	        folderpath: Directory containing the inputs read by the helper
	            functions (``paragraphs.csv``, and for drawing ``data.csv`` plus
	            the page images).
	        draw: When true, also mark the majority, concurrence and dissent
	            sentences on the page images.
	    """
	    paragraphs = get_paragraphed_text(folderpath)
	    majority = get_majority_author_sentence(paragraphs)
	    others = get_other_justice_sentences(paragraphs, majority[1])
	    split(paragraphs, majority, others).to_csv(folderpath + '/opinions.csv', index=False)

	    if not draw:
	        return

	    # Majority uses the default colour; each other kind has its own.
	    draw_line_above_sent(folderpath, majority[0], majority[1])
	    for key, line_color in (('Concurrences', (0,100,0)), ('Dissents', (0,0,100))):
	        for entry in others[key]:
	            draw_line_above_sent(folderpath, entry[0], entry[1], color=line_color)