Update Tagger.py
Browse files
Updated with fixes
Tagger.py
CHANGED
|
@@ -1,3 +1,4 @@
|
|
|
|
|
| 1 |
import pandas as pd
|
| 2 |
import numpy as np
|
| 3 |
import re
|
|
@@ -37,31 +38,73 @@ def get_majority_author_sentence(paras_text):
|
|
| 37 |
if ("justice" in s and "opinion" in s and "court" in s and ("delivered" in s or "announced" in s)):
|
| 38 |
if j != 0 and j != len(sents)-1:
|
| 39 |
print("Located, but not within first or last paragraph")
|
| 40 |
-
return [s, i]
|
| 41 |
-
|
|
|
|
| 42 |
s = s.lower()
|
| 43 |
if ("per" in s and "curiam" in s):
|
| 44 |
if j != 0 and j != len(sents)-1:
|
| 45 |
print("Located, but not within first or last paragraph")
|
| 46 |
-
return [s, i]
|
| 47 |
raise Exception("Could Not Locate Authoring Justice Sentence/Paragraph")
|
| 48 |
|
| 49 |
-
def
|
| 50 |
data = {}
|
| 51 |
-
|
|
|
|
| 52 |
for (i,pt) in enumerate(paras_text):
|
| 53 |
if i < ind_maj:
|
| 54 |
continue
|
| 55 |
sents = sent_tokenize(pt)
|
| 56 |
for (j,s) in enumerate(sents):
|
| 57 |
s = s.lower()
|
| 58 |
-
if "justice" in s
|
| 59 |
-
if "concurring" in s:
|
| 60 |
-
|
| 61 |
-
|
| 62 |
-
data['
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 63 |
return data
|
| 64 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 65 |
def draw_line_above_sent(folderpath, sent, para_ind, color=(0,0,0)):
|
| 66 |
data_df = pd.read_csv(folderpath + '/data.csv')
|
| 67 |
paras_df = pd.read_csv(folderpath + '/paragraphs.csv')
|
|
@@ -80,14 +123,19 @@ def draw_line_above_sent(folderpath, sent, para_ind, color=(0,0,0)):
|
|
| 80 |
image = cv2.line(image, (line_bbox[0] - 10, line_bbox[1]), (line_bbox[2] + 10, line_bbox[1]), color=color, thickness=2)
|
| 81 |
cv2.imwrite(folderpath + '/' + str(pg_ind) + '-processed.png', image)
|
| 82 |
|
|
|
|
| 83 |
def process_file(folderpath, draw=False):
|
| 84 |
paras_text = get_paragraphed_text(folderpath)
|
| 85 |
maj = get_majority_author_sentence(paras_text)
|
| 86 |
-
|
|
|
|
|
|
|
| 87 |
|
| 88 |
if draw:
|
| 89 |
draw_line_above_sent(folderpath, maj[0], maj[1])
|
| 90 |
-
for c in
|
| 91 |
draw_line_above_sent(folderpath, c[0], c[1], color=(0,100,0))
|
| 92 |
-
for d in
|
| 93 |
-
draw_line_above_sent(folderpath, d[0], d[1], color=(
|
|
|
|
|
|
|
|
|
| 1 |
+
# This file tags the major text
|
| 2 |
import pandas as pd
|
| 3 |
import numpy as np
|
| 4 |
import re
|
|
|
|
| 38 |
if ("justice" in s and "opinion" in s and "court" in s and ("delivered" in s or "announced" in s)):
|
| 39 |
if j != 0 and j != len(sents)-1:
|
| 40 |
print("Located, but not within first or last paragraph")
|
| 41 |
+
return [s, i, 0]
|
| 42 |
+
|
| 43 |
+
for (j,s) in enumerate(sents): # Per curiam
|
| 44 |
s = s.lower()
|
| 45 |
if ("per" in s and "curiam" in s):
|
| 46 |
if j != 0 and j != len(sents)-1:
|
| 47 |
print("Located, but not within first or last paragraph")
|
| 48 |
+
return [s, i, 0]
|
| 49 |
raise Exception("Could Not Locate Authoring Justice Sentence/Paragraph")
|
| 50 |
|
| 51 |
+
def get_other_justice_sentences(paras_text, ind_maj):
    """Collect sentences that mark concurrences, dissents and recusals.

    Paragraphs whose index is below ``ind_maj`` are skipped. Every remaining
    paragraph is split with ``sent_tokenize`` and each lower-cased sentence is
    matched against simple keyword rules.

    Args:
        paras_text: Iterable of paragraph strings.
        ind_maj: Index of the first paragraph to scan.

    Returns:
        A dict with the keys 'Concurrences', 'Dissents' and 'Recused'. Each
        value is a list of ``(lower-cased sentence, paragraph index, order)``
        tuples, where ``order`` is a running counter shared by all three
        lists.
    """
    concurrences, dissents, recused = [], [], []
    order = 0       # running position shared by every recorded sentence
    current = None  # "C" or "D": kind of the most recent opinion header

    for para_idx, para in enumerate(paras_text):
        if para_idx < ind_maj:
            continue
        for raw_sent in sent_tokenize(para):
            sent = raw_sent.lower()
            if "justice" not in sent:
                continue

            has_comma = "," in sent
            if has_comma and "concurring" in sent:
                order += 1
                current = "C"
                concurrences.append((sent, para_idx, order))
            elif has_comma and "dissenting" in sent:
                order += 1
                current = "D"
                dissents.append((sent, para_idx, order))
            elif "join" in sent:
                # The counter advances even when no header has been seen yet;
                # a "join" sentence is filed under the latest header kind.
                order += 1
                if current == "C":
                    concurrences.append((sent, para_idx, order))
                elif current == "D":
                    dissents.append((sent, para_idx, order))

            # NOTE(review): this check is applied only to sentences that
            # mention "justice" -- confirm the nesting against the original.
            if "took no part" in sent:
                order += 1
                recused.append((sent, para_idx, order))

    return {
        'Concurrences': concurrences,
        'Dissents': dissents,
        'Recused': recused,
    }
|
| 80 |
|
| 81 |
+
def split(paras_text, maj, other_data):
    """Cut the paragraph list into one text segment per tagged sentence.

    Every marker (the majority sentence plus each concurrence, dissent and
    recusal entry) opens a segment at its paragraph index. A segment ends
    where the next marker, in order, begins; the last one runs to the end of
    ``paras_text``.

    Args:
        paras_text: Sequence of paragraph strings (sliced and measured with
            ``len``).
        maj: Indexable ``(sentence, paragraph index, order)`` for the
            majority opinion.
        other_data: Mapping with the keys 'Concurrences', 'Dissents' and
            'Recused', each an iterable of ``(sentence, paragraph index,
            order)`` entries.

    Returns:
        A pandas DataFrame with one row per marker, sorted by ``order``, and
        the columns 'Type', 'Author Sent', 'Start Para Ind', 'End Para Ind'
        and 'Text' (the segment's paragraphs joined with ``"<PARA>"``).
    """
    labelled_groups = (
        ('Majority', [maj]),
        ('Concurrence', other_data['Concurrences']),
        ('Dissent', other_data['Dissents']),
        ('Recused', other_data['Recused']),
    )

    # Keep the records as native Python values: packing them into a NumPy
    # array would turn every field, including the indices, into a string.
    markers = []
    for label, group in labelled_groups:
        for entry in group:
            markers.append((label, entry[0], int(entry[1]), int(entry[2])))

    # list.sort is stable, so markers with equal order keep insertion order.
    markers.sort(key=lambda marker: marker[3])

    rows = []
    for pos, (label, sentence, start_ind, _order) in enumerate(markers):
        if pos + 1 < len(markers):
            # A segment stops at the paragraph where the next marker starts.
            end_ind = markers[pos + 1][2]
        else:
            end_ind = len(paras_text)
        rows.append({
            'Type': label,
            'Author Sent': sentence,
            'Start Para Ind': start_ind,
            'End Para Ind': end_ind,
            'Text': "<PARA>".join(paras_text[start_ind:end_ind]),
        })

    return pd.DataFrame(data=rows)
|
| 107 |
+
|
| 108 |
def draw_line_above_sent(folderpath, sent, para_ind, color=(0,0,0)):
|
| 109 |
data_df = pd.read_csv(folderpath + '/data.csv')
|
| 110 |
paras_df = pd.read_csv(folderpath + '/paragraphs.csv')
|
|
|
|
| 123 |
image = cv2.line(image, (line_bbox[0] - 10, line_bbox[1]), (line_bbox[2] + 10, line_bbox[1]), color=color, thickness=2)
|
| 124 |
cv2.imwrite(folderpath + '/' + str(pg_ind) + '-processed.png', image)
|
| 125 |
|
| 126 |
+
|
| 127 |
def process_file(folderpath, draw=False):
    """Tag the opinions found under ``folderpath`` and save them as CSV.

    Loads the paragraphs with ``get_paragraphed_text``, locates the majority
    sentence and the other justices' sentences, splits the text with
    ``split`` and writes the result to ``<folderpath>/opinions.csv``.

    Args:
        folderpath: Folder that is read from and written to.
        draw: When true, also call ``draw_line_above_sent`` for the majority
            sentence and for every concurrence and dissent entry.
    """
    paragraphs = get_paragraphed_text(folderpath)
    majority = get_majority_author_sentence(paragraphs)
    others = get_other_justice_sentences(paragraphs, majority[1])

    split(paragraphs, majority, others).to_csv(folderpath + '/opinions.csv', index=False)

    if not draw:
        return

    # The majority marker uses draw_line_above_sent's default colour.
    draw_line_above_sent(folderpath, majority[0], majority[1])
    colour_by_group = (
        ('Concurrences', (0, 100, 0)),
        ('Dissents', (0, 0, 100)),
    )
    for group_key, line_colour in colour_by_group:
        for entry in others[group_key]:
            draw_line_above_sent(folderpath, entry[0], entry[1], color=line_colour)
|
| 140 |
+
|
| 141 |
+
|