cools committed on
Commit
9519a3b
·
1 Parent(s): f586d1e

Update TextProcessor.py

Browse files
Files changed (1) hide show
  1. TextProcessor.py +17 -7
TextProcessor.py CHANGED
@@ -35,7 +35,10 @@ def paragraphs(folderpath):
35
  continue
36
 
37
  if len(line_texts[j]) > 0:
38
- prior_endswith_period = re.search('\.([^A-z]{0,2})$', line_texts[j - 1].strip()) is not None
 
 
 
39
  prior_is_section_header = re.search('^([ABCDEIVX]+)$', line_texts[j - 1].strip()) is not None
40
  current_is_section_header = re.search('^([ABCDEIVX]+)$', line_texts[j].strip()) is not None
41
  prior_is_asterisk = re.search('^([\s\*]+)$', line_texts[j - 1].strip()) is not None
@@ -46,18 +49,23 @@ def paragraphs(folderpath):
46
  prior_tabbed = x1s[j - 1] - baselines[pg_inds[j - 1]] > 7
47
  prior_supertabbed = x1s[j - 1] - baselines[pg_inds[j - 1]] > 18
48
  current_supertabbed = x1s[j] - baselines[pg_inds[j]] > 18
49
- prior_more_left = x1s[j] - x1s[j - 1] > 7
50
- prior_right_margin = abs(x2s[j - 1] - rights[pg_inds[j - 1]]) < 10
51
 
52
  is_section_header = (prior_is_section_header or current_is_section_header or prior_is_asterisk or current_is_asterisk or prior_is_date)
53
- prior_period_current_tabbed = (prior_endswith_period and current_tabbed and (prior_more_left or prior_right_margin or (not prior_supertabbed and not current_supertabbed)))
 
 
 
 
54
 
55
- if is_section_header or prior_period_current_tabbed:
56
  paras.append(para)
57
  para = []
58
  # print('\n')
59
- # print(str(j) + ':\t' + str(prior_endswith_period) + '\t' + str(current_tabbed) + '\t' + str(prior_more_left)+ '\t' + str(prior_supertabbed) + '\t' + str(prior_right_margin)+ '\t' + str(current_supertabbed) +'\t' + line_text)
60
  para.append((pg_inds[j], line_inds[j], line_text))
 
61
  paras.append(para)
62
  paras_df = pd.DataFrame({'Lines': paras})
63
  return paras_df
@@ -80,4 +88,6 @@ def process_file(folderpath):
80
  x1, y1, x2, y2, para_first_line, pg_ind = indent
81
  image = cv2.imread(folderpath + '/' + str(pg_ind) + '-processed.png')
82
  cv2.circle(image, (x1 - 15, int(0.5 * (y1 + y2))), radius=1, color=(240, 32, 160), thickness=2)
83
- cv2.imwrite(folderpath + '/' + str(pg_ind) + '-processed.png', image)
 
 
 
35
  continue
36
 
37
  if len(line_texts[j]) > 0:
38
+ prior_median = (baselines[pg_inds[j - 1]] + rights[pg_inds[j - 1]]) / 2
39
+ current_median = (baselines[pg_inds[j]] + rights[pg_inds[j]]) / 2
40
+
41
+ prior_endswith_period = re.search('[:\.]([^A-z]{0,2})$',line_texts[j - 1].strip()) is not None # Include colon?
42
  prior_is_section_header = re.search('^([ABCDEIVX]+)$', line_texts[j - 1].strip()) is not None
43
  current_is_section_header = re.search('^([ABCDEIVX]+)$', line_texts[j].strip()) is not None
44
  prior_is_asterisk = re.search('^([\s\*]+)$', line_texts[j - 1].strip()) is not None
 
49
  prior_tabbed = x1s[j - 1] - baselines[pg_inds[j - 1]] > 7
50
  prior_supertabbed = x1s[j - 1] - baselines[pg_inds[j - 1]] > 18
51
  current_supertabbed = x1s[j] - baselines[pg_inds[j]] > 18
52
+ prior_more_left = (x1s[j] - baselines[pg_inds[j]]) - (x1s[j - 1] - baselines[pg_inds[j - 1]]) > 7
53
+ prior_right_margin = x1s[j - 1] > prior_median
54
 
55
  is_section_header = (prior_is_section_header or current_is_section_header or prior_is_asterisk or current_is_asterisk or prior_is_date)
56
+ is_the_classic = (prior_endswith_period and current_tabbed and prior_more_left and not prior_supertabbed) # Note: Supertabbing oofs stuff
57
+ is_start_blockquote = (prior_endswith_period and current_supertabbed and prior_more_left)
58
+ is_after_blockquote = (prior_endswith_period and not current_supertabbed and prior_supertabbed)
59
+ is_after_disposition = (prior_right_margin and current_tabbed)
60
+ is_after_oneline_paragraph = (prior_tabbed and current_tabbed and not prior_supertabbed and not current_supertabbed)
61
 
62
+ if is_section_header or is_the_classic or is_after_oneline_paragraph or is_start_blockquote or is_after_blockquote or is_after_disposition:
63
  paras.append(para)
64
  para = []
65
  # print('\n')
66
+ # print(str([j, pg_inds[j]]) + ':\t' + str(is_the_classic) + '\t' + str(is_start_blockquote) + '\t' + str(is_after_blockquote)+ '\t' + str(is_after_disposition) + '\t' + line_text)
67
  para.append((pg_inds[j], line_inds[j], line_text))
68
+
69
  paras.append(para)
70
  paras_df = pd.DataFrame({'Lines': paras})
71
  return paras_df
 
88
  x1, y1, x2, y2, para_first_line, pg_ind = indent
89
  image = cv2.imread(folderpath + '/' + str(pg_ind) + '-processed.png')
90
  cv2.circle(image, (x1 - 15, int(0.5 * (y1 + y2))), radius=1, color=(240, 32, 160), thickness=2)
91
+ cv2.imwrite(folderpath + '/' + str(pg_ind) + '-processed.png', image)
92
+
93
+ process_file('PDF Cases/462_122')