Update TextProcessor.py
Browse files- TextProcessor.py +10 -5
TextProcessor.py
CHANGED
|
@@ -29,6 +29,7 @@ def paragraphs(folderpath):
|
|
| 29 |
baselines[i] = min(pg_x1s)
|
| 30 |
rights[i] = max(pg_x2s)
|
| 31 |
|
|
|
|
| 32 |
for (j, line_text) in enumerate(line_texts):
|
| 33 |
if j == 0:
|
| 34 |
para = []
|
|
@@ -38,7 +39,8 @@ def paragraphs(folderpath):
|
|
| 38 |
prior_median = (baselines[pg_inds[j - 1]] + rights[pg_inds[j - 1]]) / 2
|
| 39 |
current_median = (baselines[pg_inds[j]] + rights[pg_inds[j]]) / 2
|
| 40 |
|
| 41 |
-
prior_endswith_period = re.search('[:\.]([^A-z]{0,2})$', line_texts[j - 1].strip()) is not None
|
|
|
|
| 42 |
prior_is_section_header = re.search('^([ABCDEIVX]+)$', line_texts[j - 1].strip()) is not None
|
| 43 |
current_is_section_header = re.search('^([ABCDEIVX]+)$', line_texts[j].strip()) is not None
|
| 44 |
prior_is_asterisk = re.search('^([\s\*]+)$', line_texts[j - 1].strip()) is not None
|
|
@@ -57,7 +59,12 @@ def paragraphs(folderpath):
|
|
| 57 |
is_start_blockquote = (prior_endswith_period and current_supertabbed and prior_more_left)
|
| 58 |
is_after_blockquote = (prior_endswith_period and not current_supertabbed and prior_supertabbed)
|
| 59 |
is_after_disposition = (prior_right_margin and current_tabbed)
|
| 60 |
-
is_after_oneline_paragraph = (prior_tabbed and current_tabbed and not prior_supertabbed and not current_supertabbed)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 61 |
|
| 62 |
if is_section_header or is_the_classic or is_after_oneline_paragraph or is_start_blockquote or is_after_blockquote or is_after_disposition:
|
| 63 |
paras.append(para)
|
|
@@ -88,6 +95,4 @@ def process_file(folderpath):
|
|
| 88 |
x1, y1, x2, y2, para_first_line, pg_ind = indent
|
| 89 |
image = cv2.imread(folderpath + '/' + str(pg_ind) + '-processed.png')
|
| 90 |
cv2.circle(image, (x1 - 15, int(0.5 * (y1 + y2))), radius=1, color=(240, 32, 160), thickness=2)
|
| 91 |
-
cv2.imwrite(folderpath + '/' + str(pg_ind) + '-processed.png', image)
|
| 92 |
-
|
| 93 |
-
process_file('PDF Cases/462_122')
|
|
|
|
| 29 |
baselines[i] = min(pg_x1s)
|
| 30 |
rights[i] = max(pg_x2s)
|
| 31 |
|
| 32 |
+
is_inblock = False
|
| 33 |
for (j, line_text) in enumerate(line_texts):
|
| 34 |
if j == 0:
|
| 35 |
para = []
|
|
|
|
| 39 |
prior_median = (baselines[pg_inds[j - 1]] + rights[pg_inds[j - 1]]) / 2
|
| 40 |
current_median = (baselines[pg_inds[j]] + rights[pg_inds[j]]) / 2
|
| 41 |
|
| 42 |
+
prior_endswith_period = re.search('[:\.]([^A-z]{0,2})$',
|
| 43 |
+
line_texts[j - 1].strip()) is not None # Include colon?
|
| 44 |
prior_is_section_header = re.search('^([ABCDEIVX]+)$', line_texts[j - 1].strip()) is not None
|
| 45 |
current_is_section_header = re.search('^([ABCDEIVX]+)$', line_texts[j].strip()) is not None
|
| 46 |
prior_is_asterisk = re.search('^([\s\*]+)$', line_texts[j - 1].strip()) is not None
|
|
|
|
| 59 |
is_start_blockquote = (prior_endswith_period and current_supertabbed and prior_more_left)
|
| 60 |
is_after_blockquote = (prior_endswith_period and not current_supertabbed and prior_supertabbed)
|
| 61 |
is_after_disposition = (prior_right_margin and current_tabbed)
|
| 62 |
+
is_after_oneline_paragraph = (prior_tabbed and current_tabbed and not prior_supertabbed and not current_supertabbed and not is_inblock)
|
| 63 |
+
|
| 64 |
+
if is_start_blockquote:
|
| 65 |
+
is_inblock = True
|
| 66 |
+
if is_after_blockquote:
|
| 67 |
+
is_inblock = False
|
| 68 |
|
| 69 |
if is_section_header or is_the_classic or is_after_oneline_paragraph or is_start_blockquote or is_after_blockquote or is_after_disposition:
|
| 70 |
paras.append(para)
|
|
|
|
| 95 |
x1, y1, x2, y2, para_first_line, pg_ind = indent
|
| 96 |
image = cv2.imread(folderpath + '/' + str(pg_ind) + '-processed.png')
|
| 97 |
cv2.circle(image, (x1 - 15, int(0.5 * (y1 + y2))), radius=1, color=(240, 32, 160), thickness=2)
|
| 98 |
+
cv2.imwrite(folderpath + '/' + str(pg_ind) + '-processed.png', image)
|
|
|
|
|
|