cools committed on
Commit
009dd9c
·
1 Parent(s): 9519a3b

Update TextProcessor.py

Browse files
Files changed (1) hide show
  1. TextProcessor.py +10 -5
TextProcessor.py CHANGED
@@ -29,6 +29,7 @@ def paragraphs(folderpath):
29
  baselines[i] = min(pg_x1s)
30
  rights[i] = max(pg_x2s)
31
 
 
32
  for (j, line_text) in enumerate(line_texts):
33
  if j == 0:
34
  para = []
@@ -38,7 +39,8 @@ def paragraphs(folderpath):
38
  prior_median = (baselines[pg_inds[j - 1]] + rights[pg_inds[j - 1]]) / 2
39
  current_median = (baselines[pg_inds[j]] + rights[pg_inds[j]]) / 2
40
 
41
- prior_endswith_period = re.search('[:\.]([^A-z]{0,2})$',line_texts[j - 1].strip()) is not None # Include colon?
 
42
  prior_is_section_header = re.search('^([ABCDEIVX]+)$', line_texts[j - 1].strip()) is not None
43
  current_is_section_header = re.search('^([ABCDEIVX]+)$', line_texts[j].strip()) is not None
44
  prior_is_asterisk = re.search('^([\s\*]+)$', line_texts[j - 1].strip()) is not None
@@ -57,7 +59,12 @@ def paragraphs(folderpath):
57
  is_start_blockquote = (prior_endswith_period and current_supertabbed and prior_more_left)
58
  is_after_blockquote = (prior_endswith_period and not current_supertabbed and prior_supertabbed)
59
  is_after_disposition = (prior_right_margin and current_tabbed)
60
- is_after_oneline_paragraph = (prior_tabbed and current_tabbed and not prior_supertabbed and not current_supertabbed)
 
 
 
 
 
61
 
62
  if is_section_header or is_the_classic or is_after_oneline_paragraph or is_start_blockquote or is_after_blockquote or is_after_disposition:
63
  paras.append(para)
@@ -88,6 +95,4 @@ def process_file(folderpath):
88
  x1, y1, x2, y2, para_first_line, pg_ind = indent
89
  image = cv2.imread(folderpath + '/' + str(pg_ind) + '-processed.png')
90
  cv2.circle(image, (x1 - 15, int(0.5 * (y1 + y2))), radius=1, color=(240, 32, 160), thickness=2)
91
- cv2.imwrite(folderpath + '/' + str(pg_ind) + '-processed.png', image)
92
-
93
- process_file('PDF Cases/462_122')
 
29
  baselines[i] = min(pg_x1s)
30
  rights[i] = max(pg_x2s)
31
 
32
+ is_inblock = False
33
  for (j, line_text) in enumerate(line_texts):
34
  if j == 0:
35
  para = []
 
39
  prior_median = (baselines[pg_inds[j - 1]] + rights[pg_inds[j - 1]]) / 2
40
  current_median = (baselines[pg_inds[j]] + rights[pg_inds[j]]) / 2
41
 
42
+ prior_endswith_period = re.search('[:\.]([^A-z]{0,2})$',
43
+ line_texts[j - 1].strip()) is not None # Include colon?
44
  prior_is_section_header = re.search('^([ABCDEIVX]+)$', line_texts[j - 1].strip()) is not None
45
  current_is_section_header = re.search('^([ABCDEIVX]+)$', line_texts[j].strip()) is not None
46
  prior_is_asterisk = re.search('^([\s\*]+)$', line_texts[j - 1].strip()) is not None
 
59
  is_start_blockquote = (prior_endswith_period and current_supertabbed and prior_more_left)
60
  is_after_blockquote = (prior_endswith_period and not current_supertabbed and prior_supertabbed)
61
  is_after_disposition = (prior_right_margin and current_tabbed)
62
+ is_after_oneline_paragraph = (prior_tabbed and current_tabbed and not prior_supertabbed and not current_supertabbed and not is_inblock)
63
+
64
+ if is_start_blockquote:
65
+ is_inblock = True
66
+ if is_after_blockquote:
67
+ is_inblock = False
68
 
69
  if is_section_header or is_the_classic or is_after_oneline_paragraph or is_start_blockquote or is_after_blockquote or is_after_disposition:
70
  paras.append(para)
 
95
  x1, y1, x2, y2, para_first_line, pg_ind = indent
96
  image = cv2.imread(folderpath + '/' + str(pg_ind) + '-processed.png')
97
  cv2.circle(image, (x1 - 15, int(0.5 * (y1 + y2))), radius=1, color=(240, 32, 160), thickness=2)
98
+ cv2.imwrite(folderpath + '/' + str(pg_ind) + '-processed.png', image)