cools committed on
Commit
ee033d3
·
1 Parent(s): 3aa9c84

Update TextProcessor.py

Browse files
Files changed (1) hide show
  1. TextProcessor.py +4 -5
TextProcessor.py CHANGED
@@ -1,4 +1,3 @@
1
- # This file converts the images into text
2
  import pandas as pd
3
  import numpy as np
4
  import fitz
@@ -53,20 +52,21 @@ def paragraphs(folderpath):
53
  current_supertabbed = x1s[j] - baselines[pg_inds[j]] >= 14
54
  prior_more_left = (x1s[j] - baselines[pg_inds[j]]) - (x1s[j - 1] - baselines[pg_inds[j - 1]]) > 7
55
  prior_right_margin = x1s[j - 1] > prior_median
 
56
 
57
  is_section_header = (prior_is_section_header or current_is_section_header or prior_is_asterisk or current_is_asterisk or prior_is_date)
58
  is_the_classic = (prior_endswith_period and current_tabbed and prior_more_left and not prior_supertabbed) # Note: Supertabbing oofs stuff
59
  is_start_blockquote = (prior_endswith_period and current_supertabbed and prior_more_left and not is_inblock)
60
  is_after_blockquote = (prior_endswith_period and not current_supertabbed and is_inblock)
61
  is_after_disposition = (prior_right_margin and current_tabbed)
 
62
 
63
  if is_start_blockquote and not is_section_header:
64
  is_inblock = True
65
- # print("START BLOCK")
66
  if is_after_blockquote or prior_is_date: # This date hack helps ensure that slip opinion headers do not get caught
67
  is_inblock = False
68
 
69
- if is_section_header or is_the_classic or is_start_blockquote or is_after_blockquote or is_after_disposition:
70
  paras.append(para)
71
  para = []
72
  # print('\n')
@@ -96,5 +96,4 @@ def process_file(folderpath):
96
  x1, y1, x2, y2, para_first_line, pg_ind = indent
97
  image = cv2.imread(folderpath + '/' + str(pg_ind) + '-processed.png')
98
  cv2.circle(image, (x1 - 15, int(0.5 * (y1 + y2))), radius=1, color=(240, 32, 160), thickness=2)
99
- cv2.imwrite(folderpath + '/' + str(pg_ind) + '-processed.png', image)
100
-
 
 
1
  import pandas as pd
2
  import numpy as np
3
  import fitz
 
52
  current_supertabbed = x1s[j] - baselines[pg_inds[j]] >= 14
53
  prior_more_left = (x1s[j] - baselines[pg_inds[j]]) - (x1s[j - 1] - baselines[pg_inds[j - 1]]) > 7
54
  prior_right_margin = x1s[j - 1] > prior_median
55
+ current_right_margin = x1s[j] > prior_median
56
 
57
  is_section_header = (prior_is_section_header or current_is_section_header or prior_is_asterisk or current_is_asterisk or prior_is_date)
58
  is_the_classic = (prior_endswith_period and current_tabbed and prior_more_left and not prior_supertabbed) # Note: Supertabbing oofs stuff
59
  is_start_blockquote = (prior_endswith_period and current_supertabbed and prior_more_left and not is_inblock)
60
  is_after_blockquote = (prior_endswith_period and not current_supertabbed and is_inblock)
61
  is_after_disposition = (prior_right_margin and current_tabbed)
62
+ is_disposition = (current_right_margin and ("affirm" in line_texts[j].lower()) or "reverse" in line_texts[j].lower() or "vacate" in line_texts[j].lower() or "so ordered" in line_texts[j].lower())
63
 
64
  if is_start_blockquote and not is_section_header:
65
  is_inblock = True
 
66
  if is_after_blockquote or prior_is_date: # This date hack helps ensure that slip opinion headers do not get caught
67
  is_inblock = False
68
 
69
+ if is_section_header or is_the_classic or is_start_blockquote or is_after_blockquote or is_after_disposition or is_disposition:
70
  paras.append(para)
71
  para = []
72
  # print('\n')
 
96
  x1, y1, x2, y2, para_first_line, pg_ind = indent
97
  image = cv2.imread(folderpath + '/' + str(pg_ind) + '-processed.png')
98
  cv2.circle(image, (x1 - 15, int(0.5 * (y1 + y2))), radius=1, color=(240, 32, 160), thickness=2)
99
+ cv2.imwrite(folderpath + '/' + str(pg_ind) + '-processed.png', image)