cools committed on
Commit
2bc03b3
·
1 Parent(s): 50ef89f

Update TextProcessor.py

Browse files
Files changed (1) hide show
  1. TextProcessor.py +22 -21
TextProcessor.py CHANGED
@@ -16,7 +16,7 @@ def paragraphs(folderpath):
16
  for (i, pg_ind) in enumerate(pg_indices):
17
  lines = eval(df[df['Pg Ind'] == i]['Lines'].tolist()[0])
18
  pg_x1s, pg_x2s = [], []
19
- for (j,n) in enumerate(lines):
20
  x1s.append(n[0])
21
  y1s.append(n[1])
22
  x2s.append(n[2])
@@ -34,42 +34,42 @@ def paragraphs(folderpath):
34
  if j == 0:
35
  para = []
36
  continue
37
-
38
  if len(line_texts[j]) > 0:
39
- prior_median = (baselines[pg_inds[j-1]] + rights[pg_inds[j-1]])/2
40
- current_median = (baselines[pg_inds[j]] + rights[pg_inds[j]])/2
41
-
42
- prior_endswith_period = re.search('[:\.]([^A-z]{0,2})$', line_texts[j-1].strip()) is not None # Include colon?
43
- prior_is_section_header = re.search('^([ABCDEIVX]+)$', line_texts[j-1].strip()) is not None
44
  current_is_section_header = re.search('^([ABCDEIVX]+)$', line_texts[j].strip()) is not None
45
- prior_is_asterisk = re.search('^([\s\*]+)$', line_texts[j-1].strip()) is not None
46
  current_is_asterisk = re.search('^([\s\*]+)$', line_texts[j].strip()) is not None
47
- prior_is_date = re.search('(\[[A-z\s0-9]*,\s[0-9]*]+)$', line_texts[j-1].strip()) is not None
48
-
49
- current_tabbed = x1s[j]-baselines[pg_inds[j]] > 7
50
- prior_tabbed = x1s[j-1]-baselines[pg_inds[j-1]] > 7
51
- prior_supertabbed = x1s[j-1]-baselines[pg_inds[j-1]] > 18
52
- current_supertabbed = x1s[j]-baselines[pg_inds[j]] > 18
53
- prior_more_left = (x1s[j]-baselines[pg_inds[j]])-(x1s[j-1]-baselines[pg_inds[j-1]]) > 7
54
- prior_right_margin = x1s[j-1] > prior_median
55
 
56
  is_section_header = (prior_is_section_header or current_is_section_header or prior_is_asterisk or current_is_asterisk or prior_is_date)
57
- is_the_classic = (prior_endswith_period and current_tabbed and prior_more_left and not prior_supertabbed) # Note: Supertabbing oofs stuff
58
  is_start_blockquote = (prior_endswith_period and current_supertabbed and prior_more_left and not is_inblock)
59
  is_after_blockquote = (prior_endswith_period and not current_supertabbed and is_inblock)
60
  is_after_disposition = (prior_right_margin and current_tabbed)
61
  is_after_oneline_paragraph = (prior_tabbed and current_tabbed and not prior_supertabbed and not current_supertabbed and not is_inblock)
62
-
63
  if is_start_blockquote and not is_section_header:
64
  is_inblock = True
65
  if is_after_blockquote:
66
  is_inblock = False
67
-
68
  if is_section_header or is_the_classic or is_after_oneline_paragraph or is_start_blockquote or is_after_blockquote or is_after_disposition:
69
  paras.append(para)
70
  para = []
71
- print('\n')
72
- print(str([j, pg_inds[j]]) + ':\t' + str(is_the_classic) + '\t' + str(is_start_blockquote) + '\t' + str(is_after_blockquote)+ '\t' + str(is_after_disposition) + '\t' + line_text)
73
  para.append((pg_inds[j], line_inds[j], line_text))
74
 
75
  paras.append(para)
@@ -95,3 +95,4 @@ def process_file(folderpath):
95
  image = cv2.imread(folderpath + '/' + str(pg_ind) + '-processed.png')
96
  cv2.circle(image, (x1 - 15, int(0.5 * (y1 + y2))), radius=1, color=(240, 32, 160), thickness=2)
97
  cv2.imwrite(folderpath + '/' + str(pg_ind) + '-processed.png', image)
 
 
16
  for (i, pg_ind) in enumerate(pg_indices):
17
  lines = eval(df[df['Pg Ind'] == i]['Lines'].tolist()[0])
18
  pg_x1s, pg_x2s = [], []
19
+ for (j, n) in enumerate(lines):
20
  x1s.append(n[0])
21
  y1s.append(n[1])
22
  x2s.append(n[2])
 
34
  if j == 0:
35
  para = []
36
  continue
37
+
38
  if len(line_texts[j]) > 0:
39
+ prior_median = (baselines[pg_inds[j - 1]] + rights[pg_inds[j - 1]]) / 2
40
+ current_median = (baselines[pg_inds[j]] + rights[pg_inds[j]]) / 2
41
+
42
+ prior_endswith_period = re.search('[:\.]([^A-z]{0,2})$',line_texts[j - 1].strip()) is not None # Include colon?
43
+ prior_is_section_header = re.search('^([ABCDEIVX]+)$', line_texts[j - 1].strip()) is not None
44
  current_is_section_header = re.search('^([ABCDEIVX]+)$', line_texts[j].strip()) is not None
45
+ prior_is_asterisk = re.search('^([\s\*]+)$', line_texts[j - 1].strip()) is not None
46
  current_is_asterisk = re.search('^([\s\*]+)$', line_texts[j].strip()) is not None
47
+ prior_is_date = re.search('(\[[A-z\s0-9]*,\s[0-9]*]+)$', line_texts[j - 1].strip()) is not None
48
+
49
+ current_tabbed = x1s[j] - baselines[pg_inds[j]] > 7
50
+ prior_tabbed = x1s[j - 1] - baselines[pg_inds[j - 1]] > 7
51
+ prior_supertabbed = x1s[j - 1] - baselines[pg_inds[j - 1]] >= 11
52
+ current_supertabbed = x1s[j] - baselines[pg_inds[j]] >= 11
53
+ prior_more_left = (x1s[j] - baselines[pg_inds[j]]) - (x1s[j - 1] - baselines[pg_inds[j - 1]]) > 7
54
+ prior_right_margin = x1s[j - 1] > prior_median
55
 
56
  is_section_header = (prior_is_section_header or current_is_section_header or prior_is_asterisk or current_is_asterisk or prior_is_date)
57
+ is_the_classic = (prior_endswith_period and current_tabbed and prior_more_left and not prior_supertabbed) # Note: Supertabbing oofs stuff
58
  is_start_blockquote = (prior_endswith_period and current_supertabbed and prior_more_left and not is_inblock)
59
  is_after_blockquote = (prior_endswith_period and not current_supertabbed and is_inblock)
60
  is_after_disposition = (prior_right_margin and current_tabbed)
61
  is_after_oneline_paragraph = (prior_tabbed and current_tabbed and not prior_supertabbed and not current_supertabbed and not is_inblock)
62
+
63
  if is_start_blockquote and not is_section_header:
64
  is_inblock = True
65
  if is_after_blockquote:
66
  is_inblock = False
67
+
68
  if is_section_header or is_the_classic or is_after_oneline_paragraph or is_start_blockquote or is_after_blockquote or is_after_disposition:
69
  paras.append(para)
70
  para = []
71
+ # print('\n')
72
+ # print(str([j, pg_inds[j]]) + ':\t' + str(is_the_classic) + '\t' + str(is_start_blockquote) + '\t' + str(is_after_blockquote)+ '\t' + str(is_after_disposition) + '\t' + str(is_after_oneline_paragraph) + '\t' + line_text)
73
  para.append((pg_inds[j], line_inds[j], line_text))
74
 
75
  paras.append(para)
 
95
  image = cv2.imread(folderpath + '/' + str(pg_ind) + '-processed.png')
96
  cv2.circle(image, (x1 - 15, int(0.5 * (y1 + y2))), radius=1, color=(240, 32, 160), thickness=2)
97
  cv2.imwrite(folderpath + '/' + str(pg_ind) + '-processed.png', image)
98
+