cools commited on
Commit
107f597
·
1 Parent(s): fb4beeb

Delete Tagger.py

Browse files
Files changed (1) hide show
  1. Tagger.py +0 -142
Tagger.py DELETED
@@ -1,142 +0,0 @@
1
- # This file tags the major text
2
- import pandas as pd
3
- import numpy as np
4
- import re
5
- from sentence_transformers import SentenceTransformer
6
- from sentence_transformers.util import cos_sim
7
- import cv2
8
- import nltk
9
- nltk.download('punkt')
10
- from nltk.tokenize import sent_tokenize, word_tokenize
11
-
12
-
13
- model = SentenceTransformer('all-mpnet-base-v2')
14
-
15
def get_paragraphed_text(folderpath):
    """Load `folderpath`/paragraphs.csv and return one text string per paragraph.

    Each row's 'Lines' column holds a stringified list of line records whose
    last element is the line's text. The per-line texts are stripped and
    joined with single spaces.

    Parameters:
        folderpath: directory containing paragraphs.csv.
    Returns:
        list[str]: one joined, whitespace-normalised string per paragraph.
    """
    paras_df = pd.read_csv(folderpath + '/paragraphs.csv')
    # NOTE(review): eval() on CSV contents executes arbitrary code if the file
    # is untrusted — ast.literal_eval would be a safer drop-in for these
    # list literals.
    paras_lines = [eval(p) for p in paras_df['Lines'].tolist()]
    paras_text = []
    for para in paras_lines:  # enumerate index was unused
        line_texts = [line[-1].strip() for line in para]
        # Collapse double spaces left after joining stripped fragments
        # (the previous replace(' ', ' ') was a no-op).
        paras_text.append(" ".join(line_texts).strip().replace('  ', ' '))
    return paras_text
23
-
24
def semantic_match(template, corpus):
    """Return (index, score) of the corpus entry most similar to `template`.

    Encodes `corpus` and `template` with the module-level SentenceTransformer
    and scores them with cosine similarity.

    Parameters:
        template: a sentence string, or a list of sentence strings.
        corpus: list of candidate strings to match against.
    Returns:
        (argmax index into the flattened score matrix, max score row/value).
    """
    embs = model.encode(corpus)
    # isinstance instead of type() == list: idiomatic and accepts subclasses.
    if isinstance(template, list):
        template_emb = model.encode(template)
    else:
        template_emb = model.encode([template])
    scores = cos_sim(embs, template_emb)
    # NOTE(review): scores is 2-D (len(corpus) x len(template)). np.argmax
    # flattens it, and max(scores) compares whole rows — both are only
    # well-defined when template is a single sentence (the only usage in this
    # file); confirm before passing a multi-element list.
    return np.argmax(scores), max(scores)
32
-
33
def get_majority_author_sentence(paras_text):
    """Find the sentence announcing who authored the majority opinion.

    Scans each paragraph's sentences twice: first for the
    "<Justice> delivered/announced the opinion of the Court" pattern, then for
    an unsigned per-curiam marker.

    Returns:
        [lowercased sentence, paragraph index, 0] — the trailing 0 seeds the
        document-order counter used by the other extractors.
    Raises:
        Exception: if no authoring sentence is found anywhere.
    """
    for para_ind, paragraph in enumerate(paras_text):
        sentences = sent_tokenize(paragraph)
        last = len(sentences) - 1

        # Pass 1: explicit delivery/announcement of the Court's opinion.
        for sent_ind, sentence in enumerate(sentences):
            lowered = sentence.lower()
            delivered = ("justice" in lowered and "opinion" in lowered
                         and "court" in lowered
                         and ("deliver" in lowered or "announc" in lowered))
            if delivered:
                if sent_ind not in (0, last):
                    print("Located, but not within first or last paragraph")
                return [lowered, para_ind, 0]

        # Pass 2: per-curiam (unsigned) opinions.
        for sent_ind, sentence in enumerate(sentences):
            lowered = sentence.lower()
            if "per" in lowered and "curiam" in lowered:
                if sent_ind not in (0, last):
                    print("Located, but not within first or last paragraph")
                return [lowered, para_ind, 0]

    raise Exception("Could Not Locate Authoring Justice Sentence/Paragraph")
50
-
51
def get_other_justices_sentences(paras_text, ind_maj):
    """Collect sentences that open the non-majority opinions.

    Starting at the majority-opinion paragraph (index `ind_maj`), scans every
    sentence for concurrence/dissent headers, short "Justice X joins" lines,
    and recusal ("took no part") notices.

    Returns:
        dict with keys 'Concurrences', 'Dissents', 'Recused'; each value is a
        list of (lowercased sentence, paragraph index, counter) tuples, where
        `counter` records overall document order across all categories.
    """
    data = {}
    counter = 0
    data['Concurrences'], data['Dissents'], data['Recused'], last = [], [], [], None
    for (i,pt) in enumerate(paras_text):
        if i < ind_maj:
            # Skip everything before the majority opinion's paragraph.
            continue
        sents = sent_tokenize(pt)
        for (j,s) in enumerate(sents):
            s = s.lower()
            if "justice" in s:
                # ', concurring' marks a concurrence header; the second regex
                # rejects parenthesised cross-references like '(X concurring)'.
                # NOTE(review): [A-z] also matches [ \ ] ^ _ ` — presumably
                # [A-Za-z] was intended; confirm before changing.
                if re.search(',\s?concurring', s) is not None and re.search('\([A-z,\s]*concurring[A-z,\s]*\)', s) is None: # Regex catches 'Justice (concurring...)'
                    counter += 1
                    last = "C"
                    data['Concurrences'].append((s,i,counter))
                elif (re.search(',\s?dissenting', s) or "dissent" in s[-9:].strip()) and re.search('\([A-z,\s]*dissenting[A-z,\s]*\)', s) is None:
                    counter += 1
                    data['Dissents'].append((s,i,counter))
                    last = "D"
                elif "join" in s and s.index('join') > s.index('justice') and len(s.split(' ')) < 15:
                    # Short "Justice X joins ..." sentence: credit it to the
                    # most recently seen concurrence or dissent.
                    counter += 1
                    if last == "C":
                        data['Concurrences'].append((s,i,counter))
                    if last == "D":
                        data['Dissents'].append((s,i,counter))
            # Recusal check is deliberately a separate `if`, so a sentence can
            # count as both a join and a recusal (incrementing counter twice).
            if "took no part" in s: # This may not be triggered as often?
                counter += 1
                data['Recused'].append((s,i,counter))
    return data
80
-
81
-
82
def split(paras_text, maj, other_data):
    """Partition the paragraphs into per-opinion text spans.

    Each opinion starts at its author sentence's paragraph and runs until the
    next opinion's start paragraph (the last one runs to the end of the
    document). Empty spans are widened to one paragraph.

    (Name shadows builtin str.split — kept for caller compatibility.)

    Parameters:
        paras_text: list of paragraph strings for the whole document.
        maj: [sentence, paragraph index, order counter] for the majority.
        other_data: dict from get_other_justices_sentences.
    Returns:
        pandas.DataFrame with columns Type, Author Sent, Start Para Ind,
        End Para Ind, Text (paragraphs joined with "<PARA>").
    """
    # Gather every opinion as (type, sentence, start paragraph, order counter).
    opinions = [('Majority', maj[0], maj[1], maj[2])]
    for label, key in (('Concurrence', 'Concurrences'),
                       ('Dissent', 'Dissents'),
                       ('Recused', 'Recused')):
        for sent, para_ind, order in other_data[key]:
            opinions.append((label, sent, para_ind, order))

    # Stable sort by the document-order counter. (Previously done by
    # round-tripping through a numpy string array, which coerced every field
    # to str; a plain key-sort keeps the original types.)
    opinions.sort(key=lambda op: int(op[3]))

    opinions_data = []
    for i, (kind, sent, para_ind, _order) in enumerate(opinions):
        start_ind = int(para_ind)
        # An opinion ends where the next one starts; the last runs to EOF.
        if i == len(opinions) - 1:
            end_ind = len(paras_text)
        else:
            end_ind = int(opinions[i + 1][2])
        if end_ind == start_ind:
            end_ind += 1  # guarantee at least one paragraph of text
        opinions_data.append({
            'Type': kind,
            'Author Sent': sent,
            'Start Para Ind': start_ind,
            'End Para Ind': end_ind,
            'Text': "<PARA>".join(paras_text[start_ind:end_ind]),
        })
    return pd.DataFrame(data=opinions_data)
110
-
111
def draw_line_above_sent(folderpath, sent, para_ind, color=(0,0,0)):
    """Draw a horizontal marker on the page image above the line matching `sent`.

    Finds, within paragraph `para_ind`, the OCR line most semantically similar
    to `sent`, looks up that line's bounding box in data.csv, and draws a 2px
    line along the box's top edge on '<page>-processed.png' (overwritten in
    place). Color is BGR, defaulting to black.
    """
    data_df = pd.read_csv(folderpath + '/data.csv')
    paras_df = pd.read_csv(folderpath + '/paragraphs.csv')
    # NOTE(review): eval() on CSV contents runs arbitrary code if the files
    # are untrusted; ast.literal_eval would be safer for these list literals.
    para_lines = eval(paras_df['Lines'].tolist()[para_ind])
    text_lines = []

    # Each line record unpacks to (page index, line index, ?, text);
    # only the text is needed for matching here.
    for (i, l) in enumerate(para_lines):
        pg_ind, line_ind, _, text = l
        text_lines.append(text)

    # Best-matching line within the paragraph (semantic, not exact, match).
    ind, score = semantic_match(sent, text_lines)
    pg_ind, line_ind, _, text = para_lines[ind]
    # data.csv rows are keyed by 'Pg Ind'; each 'Lines' record's leading
    # elements are the bbox, its last element the text.
    line_data = eval(data_df[data_df['Pg Ind'] == pg_ind]['Lines'].tolist()[0])[line_ind]
    line_bbox = line_data[0:-1]
    image = cv2.imread(folderpath + '/' + str(pg_ind) + '-processed.png')
    # The marker spans the bbox's top edge, extended 10px beyond each side.
    image = cv2.line(image, (line_bbox[0] - 10, line_bbox[1]), (line_bbox[2] + 10, line_bbox[1]), color=color, thickness=2)
    cv2.imwrite(folderpath + '/' + str(pg_ind) + '-processed.png', image)
128
-
129
-
130
def process_file(folderpath, draw=False):
    """Segment one document folder into opinions and write opinions.csv.

    Loads the paragraph texts, locates the majority-opinion author sentence,
    gathers the other opinions, and writes the resulting split to
    `folderpath`/opinions.csv. With draw=True, also annotates the page images
    with marker lines (black for the majority, green-ish for concurrences,
    red-ish for dissents — BGR).
    """
    paragraphs = get_paragraphed_text(folderpath)
    majority = get_majority_author_sentence(paragraphs)
    others = get_other_justices_sentences(paragraphs, majority[1])
    opinions = split(paragraphs, majority, others)
    opinions.to_csv(folderpath + '/opinions.csv', index=False)

    if not draw:
        return
    draw_line_above_sent(folderpath, majority[0], majority[1])
    for sent, para_ind, _ in others['Concurrences']:
        draw_line_above_sent(folderpath, sent, para_ind, color=(0,100,0))
    for sent, para_ind, _ in others['Dissents']:
        draw_line_above_sent(folderpath, sent, para_ind, color=(0,0,100))