cools committed on
Commit
2c92324
·
1 Parent(s): f22d6c0

Update Tagger.py

Browse files

Updated with fixes

Files changed (1) hide show
  1. Tagger.py +62 -14
Tagger.py CHANGED
@@ -1,3 +1,4 @@
 
1
  import pandas as pd
2
  import numpy as np
3
  import re
@@ -37,31 +38,73 @@ def get_majority_author_sentence(paras_text):
37
  if ("justice" in s and "opinion" in s and "court" in s and ("delivered" in s or "announced" in s)):
38
  if j != 0 and j != len(sents)-1:
39
  print("Located, but not within first or last paragraph")
40
- return [s, i]
41
- for (j,s) in enumerate(sents): # Per curiam
 
42
  s = s.lower()
43
  if ("per" in s and "curiam" in s):
44
  if j != 0 and j != len(sents)-1:
45
  print("Located, but not within first or last paragraph")
46
- return [s, i]
47
  raise Exception("Could Not Locate Authoring Justice Sentence/Paragraph")
48
 
49
def get_other_author_sentence(paras_text, ind_maj):
    """Collect the sentences that announce concurring or dissenting opinions.

    Paragraphs whose index is below ``ind_maj`` are skipped.  Every remaining
    paragraph is split into sentences with ``sent_tokenize``; a lower-cased
    sentence is kept when it contains both "justice" and a comma, and is
    filed under 'Concurrences' and/or 'Dissents' depending on whether it
    contains "concurring" and/or "dissenting".

    Returns a dict with the keys 'Concurrences' and 'Dissents', each mapping
    to a list of ``(lower-cased sentence, paragraph index)`` tuples.
    """
    found = {'Concurrences': [], 'Dissents': []}
    for para_idx, para in enumerate(paras_text):
        if para_idx < ind_maj:
            continue
        for raw_sentence in sent_tokenize(para):
            lowered = raw_sentence.lower()
            # Both markers are required before the sentence is considered.
            if "justice" not in lowered or "," not in lowered:
                continue
            # Independent checks: one sentence may land in both lists.
            if "concurring" in lowered:
                found['Concurrences'].append((lowered, para_idx))
            if "dissenting" in lowered:
                found['Dissents'].append((lowered, para_idx))
    return found
64
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
65
  def draw_line_above_sent(folderpath, sent, para_ind, color=(0,0,0)):
66
  data_df = pd.read_csv(folderpath + '/data.csv')
67
  paras_df = pd.read_csv(folderpath + '/paragraphs.csv')
@@ -80,14 +123,19 @@ def draw_line_above_sent(folderpath, sent, para_ind, color=(0,0,0)):
80
  image = cv2.line(image, (line_bbox[0] - 10, line_bbox[1]), (line_bbox[2] + 10, line_bbox[1]), color=color, thickness=2)
81
  cv2.imwrite(folderpath + '/' + str(pg_ind) + '-processed.png', image)
82
 
 
83
  def process_file(folderpath, draw=False):
84
  paras_text = get_paragraphed_text(folderpath)
85
  maj = get_majority_author_sentence(paras_text)
86
- data = get_other_author_sentence(paras_text, maj[1])
 
 
87
 
88
  if draw:
89
  draw_line_above_sent(folderpath, maj[0], maj[1])
90
- for c in data['Concurrences']:
91
  draw_line_above_sent(folderpath, c[0], c[1], color=(0,100,0))
92
- for d in data['Dissents']:
93
- draw_line_above_sent(folderpath, d[0], d[1], color=(100,0,0))
 
 
 
1
+ # This file tags the major text
2
  import pandas as pd
3
  import numpy as np
4
  import re
 
38
  if ("justice" in s and "opinion" in s and "court" in s and ("delivered" in s or "announced" in s)):
39
  if j != 0 and j != len(sents)-1:
40
  print("Located, but not within first or last paragraph")
41
+ return [s, i, 0]
42
+
43
+ for (j,s) in enumerate(sents): # Per curiam
44
  s = s.lower()
45
  if ("per" in s and "curiam" in s):
46
  if j != 0 and j != len(sents)-1:
47
  print("Located, but not within first or last paragraph")
48
+ return [s, i, 0]
49
  raise Exception("Could Not Locate Authoring Justice Sentence/Paragraph")
50
 
51
def get_other_justice_sentences(paras_text, ind_maj):
    """Collect sentences announcing concurrences, dissents and recusals.

    Paragraphs whose index is below ``ind_maj`` are skipped.  Each remaining
    paragraph is split with ``sent_tokenize`` and every lower-cased sentence
    that mentions "justice" is examined:

    * "concurring" plus a comma  -> recorded under 'Concurrences'
    * "dissenting" plus a comma  -> recorded under 'Dissents'
    * otherwise, "join"          -> recorded under whichever of the two
      categories was recorded most recently (nowhere if none has been yet)
    * "took no part"             -> additionally recorded under 'Recused'

    A running counter is advanced for every match (including a "join" match
    that is not recorded) and stored with each record, so records can later
    be put back into document order.

    Returns a dict with the keys 'Concurrences', 'Dissents' and 'Recused',
    each a list of ``(lower-cased sentence, paragraph index, counter)``.
    """
    data = {'Concurrences': [], 'Dissents': [], 'Recused': []}
    # "join" sentences are attributed to the most recent opinion category.
    join_targets = {"C": data['Concurrences'], "D": data['Dissents']}
    last = None
    counter = 0

    for para_idx, para in enumerate(paras_text):
        if para_idx < ind_maj:
            continue
        for raw_sentence in sent_tokenize(para):
            lowered = raw_sentence.lower()
            if "justice" not in lowered:
                continue

            has_comma = "," in lowered
            if "concurring" in lowered and has_comma:
                counter += 1
                last = "C"
                data['Concurrences'].append((lowered, para_idx, counter))
            elif "dissenting" in lowered and has_comma:
                counter += 1
                data['Dissents'].append((lowered, para_idx, counter))
                last = "D"
            elif "join" in lowered:
                counter += 1
                target = join_targets.get(last)
                if target is not None:
                    target.append((lowered, para_idx, counter))

            # NOTE(review): the flattened source does not show whether this
            # check sits inside the "justice" branch or directly in the
            # sentence loop; it is kept inside the branch here -- confirm.
            if "took no part" in lowered:
                counter += 1
                data['Recused'].append((lowered, para_idx, counter))
    return data
80
 
81
def split(paras_text, maj, other_data):
    """Split a case's paragraphs into one row per tagged opinion.

    Parameters
    ----------
    paras_text : sequence of str
        Paragraph texts of the whole document.
    maj : sequence
        ``[sentence, paragraph index, order]`` for the majority opinion.
    other_data : dict
        May contain the keys 'Concurrences', 'Dissents' and 'Recused', each
        a list of ``(sentence, paragraph index, order)`` records.  A missing
        key is treated as an empty list.

    Returns
    -------
    pandas.DataFrame
        One row per opinion, sorted by ``order``, with the columns 'Type',
        'Author Sent', 'Start Para Ind', 'End Para Ind' and 'Text'.  'Text'
        is the opinion's paragraphs joined with "<PARA>".
    """
    category_labels = (
        ('Concurrences', 'Concurrence'),
        ('Dissents', 'Dissent'),
        ('Recused', 'Recused'),
    )

    opinions = [('Majority', maj[0], maj[1], maj[2])]
    for key, label in category_labels:
        opinions.extend(
            (label, rec[0], rec[1], rec[2]) for rec in other_data.get(key, ())
        )

    # Sort on the order field directly instead of round-tripping the records
    # through a NumPy array, which would coerce every field to a string.
    opinions.sort(key=lambda rec: int(rec[3]))

    opinions_data = []
    for pos, (label, author_sent, para_ind, _order) in enumerate(opinions):
        start_ind = int(para_ind)
        if pos + 1 < len(opinions):
            # An opinion runs until the paragraph where the next one starts.
            end_ind = int(opinions[pos + 1][2])
        else:
            # The last opinion runs to the end of the document.
            end_ind = len(paras_text)
        opinions_data.append({
            'Type': label,
            'Author Sent': author_sent,
            'Start Para Ind': start_ind,
            'End Para Ind': end_ind,
            'Text': "<PARA>".join(paras_text[start_ind:end_ind]),
        })
    return pd.DataFrame(data=opinions_data)
107
+
108
  def draw_line_above_sent(folderpath, sent, para_ind, color=(0,0,0)):
109
  data_df = pd.read_csv(folderpath + '/data.csv')
110
  paras_df = pd.read_csv(folderpath + '/paragraphs.csv')
 
123
  image = cv2.line(image, (line_bbox[0] - 10, line_bbox[1]), (line_bbox[2] + 10, line_bbox[1]), color=color, thickness=2)
124
  cv2.imwrite(folderpath + '/' + str(pg_ind) + '-processed.png', image)
125
 
126
+
127
  def process_file(folderpath, draw=False):
128
  paras_text = get_paragraphed_text(folderpath)
129
  maj = get_majority_author_sentence(paras_text)
130
+ other_data = get_other_justice_sentences(paras_text, maj[1])
131
+ opinions_df = split(paras_text, maj, other_data)
132
+ opinions_df.to_csv(folderpath + '/opinions.csv', index=False)
133
 
134
  if draw:
135
  draw_line_above_sent(folderpath, maj[0], maj[1])
136
+ for c in other_data['Concurrences']:
137
  draw_line_above_sent(folderpath, c[0], c[1], color=(0,100,0))
138
+ for d in other_data['Dissents']:
139
+ draw_line_above_sent(folderpath, d[0], d[1], color=(0,0,100))
140
+
141
+