cools committed on
Commit
e9493dd
·
1 Parent(s): 3aa5dc8

Update ImageProcessor.py

Browse files

Added paragraph functionality

Files changed (1) hide show
  1. ImageProcessor.py +70 -5
ImageProcessor.py CHANGED
@@ -22,17 +22,14 @@ def is_leftmost(image, x, y_top, y_bot):
22
  return np.sum(left_portion) == 0
23
 
24
def get_indents(filename, body_bbox, page):
    """Find the indented text lines inside the body region of one page.

    Args:
        filename: Path of the page image; it is read with ``cv2.imread`` and
            the pixels are handed to ``is_leftmost``.
        body_bbox: Coordinates accepted by ``fitz.Rect``; used as the clip
            rectangle for text extraction.
        page: Page object providing ``get_text('dict', clip=...)``.

    Returns:
        A list of ``(line_index, x0, y0, x1, y1)`` tuples, one per indented
        line, where ``line_index`` is the position of the line among all
        lines of the clipped region. Empty when the region has no text lines.
    """
    image = cv2.imread(filename)
    body_rect = fitz.Rect(body_bbox)
    pg_dict = page.get_text('dict', clip=body_rect)

    # Blocks without a 'lines' entry (image blocks in PyMuPDF's dict layout)
    # are skipped instead of raising KeyError.
    line_boxes = [
        tuple(int(coord) for coord in line['bbox'][:4])
        for block in pg_dict['blocks']
        for line in block.get('lines', [])
    ]
    if not line_boxes:
        # min() below would raise ValueError on a page without body text.
        return []

    # The smallest left edge is the reference that indents are measured from.
    baseline = min(box[0] for box in line_boxes)

    # A line counts as indented when its left edge sits 10..29 units right of
    # the baseline and is_leftmost accepts the strip 10 units to its left.
    return [
        (i, x0, y0, x1, y1)
        for i, (x0, y0, x1, y1) in enumerate(line_boxes)
        if 9 < x0 - baseline < 30 and is_leftmost(image, x0 - 10, y0, y1)
    ]
@@ -144,6 +141,68 @@ def get_page_elements(filename, page):
144
 
145
  return page_bbox, header_bbox, fn_bbox, body_bbox, case_separator_bbox, indent_lines, image
146
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
147
  def process_file(folderpath):
148
  pdf2png(folderpath)
149
  doc = fitz.open(folderpath + '/opinion.pdf')
@@ -173,4 +232,10 @@ def process_file(folderpath):
173
  data_df = pd.concat([data_df, row_df], ignore_index=True)
174
  cv2.imwrite(folderpath + '/' + str(ind) + '-processed.png', image)
175
  data_df['Pg Ind'] = data_df['Pg Ind'].astype('int')
176
- data_df.to_csv(folderpath +'/data.csv', index=False)
 
 
 
 
 
 
 
22
  return np.sum(left_portion) == 0
23
 
24
def get_indents(filename, body_bbox, page):
    """Find the indented text lines inside the body region of one page.

    Args:
        filename: Path of the page image; it is read with ``cv2.imread`` and
            the pixels are handed to ``is_leftmost``.
        body_bbox: Coordinates accepted by ``fitz.Rect``; used as the clip
            rectangle for text extraction.
        page: Page object providing ``get_text('dict', clip=...)``.

    Returns:
        A list of ``(line_index, x0, y0, x1, y1)`` tuples, one per indented
        line, where ``line_index`` is the position of the line among all
        lines of the clipped region. Empty when the region has no text lines.
    """
    image = cv2.imread(filename)
    body_rect = fitz.Rect(body_bbox)
    pg_dict = page.get_text('dict', clip=body_rect)

    # Blocks without a 'lines' entry (image blocks in PyMuPDF's dict layout)
    # are skipped instead of raising KeyError.
    line_boxes = [
        tuple(int(coord) for coord in line['bbox'][:4])
        for block in pg_dict['blocks']
        for line in block.get('lines', [])
    ]
    if not line_boxes:
        # min() below would raise ValueError on a page without body text.
        return []

    # The smallest left edge is the reference that indents are measured from.
    baseline = min(box[0] for box in line_boxes)

    # A line counts as indented when its left edge sits more than 9 units
    # right of the baseline and is_leftmost accepts the strip 12 units to its
    # left (no upper bound on the indent).
    return [
        (i, x0, y0, x1, y1)
        for i, (x0, y0, x1, y1) in enumerate(line_boxes)
        if x0 - baseline > 9 and is_leftmost(image, x0 - 12, y0, y1)
    ]
 
141
 
142
  return page_bbox, header_bbox, fn_bbox, body_bbox, case_separator_bbox, indent_lines, image
143
 
144
def paragraphs(folderpath):
    """Rebuild paragraphs of ``opinion.pdf`` using the page data in ``data.csv``.

    Reads ``<folderpath>/data.csv`` (columns used: 'Indent Lines', 'Pg Ind',
    'Body X1', 'Body Y1', 'Body X2', 'Body Y2', 'Case Separator Y') and walks
    every page of ``<folderpath>/opinion.pdf``. A new paragraph starts at each
    line recorded as indented for that page; all other non-empty lines are
    appended to the current paragraph. The result is passed through
    ``block_quotes`` before being returned.

    Args:
        folderpath: Directory containing ``opinion.pdf`` and ``data.csv``.

    Returns:
        A DataFrame with columns 'Text', 'Indent Amount', 'Start Pg Ind' and
        'End Pg Ind'.
    """
    df = pd.read_csv(folderpath + '/data.csv')
    df = df.replace({np.nan: None})

    # NOTE(review): eval() executes whatever is stored in the CSV cell. That is
    # tolerable only while data.csv is produced by this same pipeline; consider
    # ast.literal_eval if the file can ever come from elsewhere.
    nl_inds = [eval(nli) for nli in df['Indent Lines'].tolist()]

    # (page index, line index) -> indent amount. setdefault keeps the first
    # entry for a duplicated key, matching a first-match list search, while
    # making each per-line lookup O(1).
    indent_by_line = {}
    for pg_ind, page_nli in zip(df['Pg Ind'].tolist(), nl_inds):
        for nli in page_nli:
            indent_by_line.setdefault((pg_ind, nli[0]), nli[1])

    # Each entry: [text lines, indent amount, start pg ind, end pg ind].
    # Kept as lists while building so they can be updated in place.
    paras = [[[], 0, 0, 0]]

    doc = fitz.open(folderpath + '/opinion.pdf')
    try:
        for (i, page) in enumerate(doc):
            ind = df.index[df['Pg Ind'] == i].tolist()[0]
            row = df.iloc[ind]
            body_bbox = [row['Body X1'], row['Body Y1'], row['Body X2'], row['Body Y2']]
            case_separator = row['Case Separator Y']
            if case_separator is not None:
                # Stop the body at the case separator when the page has one.
                body_bbox[-1] = case_separator
            body_rect = fitz.Rect(body_bbox)
            pg_dict = page.get_text('dict', clip=body_rect)
            # Blocks without a 'lines' entry (image blocks in PyMuPDF's dict
            # layout) are skipped instead of raising KeyError.
            all_lines = [
                get_line_text(line)
                for block in pg_dict['blocks']
                for line in block.get('lines', [])
            ]
            for (j, line) in enumerate(all_lines):
                if line == "":
                    continue
                if (i, j) in indent_by_line:
                    # An indented line opens a new paragraph on this page.
                    paras.append([[], indent_by_line[(i, j)], i, i])
                paras[-1][0].append(line.strip())
                paras[-1][-1] = i  # Update the page ind
    finally:
        # Release the PDF handle even if a page fails to process.
        doc.close()

    paras = block_quotes([tuple(p) for p in paras])
    paras_df = pd.DataFrame(data=paras, index=None, columns=['Text', 'Indent Amount', 'Start Pg Ind', 'End Pg Ind'])
    return paras_df
176
+
177
def get_line_text(line):
    """Return the text of a line: its non-blank span texts joined by one space.

    Each entry of ``line['spans']`` contributes its ``'text'`` value with
    surrounding whitespace removed; spans that are empty after stripping are
    left out.
    """
    stripped_spans = (span['text'].strip() for span in line['spans'])
    return " ".join(piece for piece in stripped_spans if piece != "")
185
+
186
def block_quotes(paras):
    """Merge runs of single-line paragraphs that form one block quote.

    A block quote opens at a single-line paragraph starting with “. It closes
    at the first single-line paragraph that (a) comes at or after a
    single-line paragraph ending with ” and (b) is followed by a paragraph
    whose indent is more than 5 units smaller. Everything from the opening to
    the closing paragraph is combined into one paragraph.

    Args:
        paras: Sequence of ``(text_lines, indent_amount, start_pg_ind,
            end_pg_ind)`` entries, where ``text_lines`` is a list of strings.

    Returns:
        A new list of 4-tuples in the same layout. A merged quote carries
        every line of the paragraphs it replaces, the indent amount and end
        page of the closing paragraph, and the start page of the opening
        paragraph. Paragraphs outside a quote, and those of a quote that
        never closes, are returned unchanged so that no text is lost.
    """
    modified_paras = []
    quote_start = None          # index of the paragraph that opened the quote
    closing_quote_seen = False  # a ” ended some paragraph since the last merge
    last = len(paras) - 1

    for (i, (para, ind_amt, start_pg_ind, end_pg_ind)) in enumerate(paras):
        single_line = len(para) == 1

        # startswith/endswith are safe on an empty string, unlike indexing.
        if single_line and quote_start is None and para[0].startswith("“"):
            quote_start = i

        if quote_start is None:
            modified_paras.append((para, ind_amt, start_pg_ind, end_pg_ind))
            continue

        if single_line and para[0].endswith("”"):
            closing_quote_seen = True

        # The last paragraph has no successor to compare indents with, so it
        # can never close a quote; it is handled by the flush after the loop.
        closes_here = (
            single_line
            and closing_quote_seen
            and i < last
            and (paras[i + 1][1] - ind_amt) < -5
        )
        if closes_here:
            merged = [text for p in paras[quote_start:i + 1] for text in p[0]]
            modified_paras.append((merged, ind_amt, paras[quote_start][2], end_pg_ind))
            quote_start = None
            closing_quote_seen = False

    if quote_start is not None:
        # The quote never closed: keep its paragraphs as they were rather
        # than dropping them.
        modified_paras.extend(tuple(p) for p in paras[quote_start:])

    return modified_paras
205
+
206
  def process_file(folderpath):
207
  pdf2png(folderpath)
208
  doc = fitz.open(folderpath + '/opinion.pdf')
 
232
  data_df = pd.concat([data_df, row_df], ignore_index=True)
233
  cv2.imwrite(folderpath + '/' + str(ind) + '-processed.png', image)
234
  data_df['Pg Ind'] = data_df['Pg Ind'].astype('int')
235
+ data_df.to_csv(folderpath +'/data.csv', index=False)
236
+ paras_df = paragraphs(folderpath)
237
+ paras_df.to_csv(folderpath + '/paragraphs.csv', index=False)
238
+
239
+
240
+
241
# Run the pipeline only when this file is executed as a script, so that
# importing the module does not trigger processing.
if __name__ == "__main__":
    process_file('PDF Cases/19-896_2135')