Update TextProcessor.py
Browse files - TextProcessor.py +4 -5
TextProcessor.py
CHANGED
|
@@ -1,4 +1,3 @@
|
|
| 1 |
-
# This file converts the images into text
|
| 2 |
import pandas as pd
|
| 3 |
import numpy as np
|
| 4 |
import fitz
|
|
@@ -53,20 +52,21 @@ def paragraphs(folderpath):
|
|
| 53 |
current_supertabbed = x1s[j] - baselines[pg_inds[j]] >= 14
|
| 54 |
prior_more_left = (x1s[j] - baselines[pg_inds[j]]) - (x1s[j - 1] - baselines[pg_inds[j - 1]]) > 7
|
| 55 |
prior_right_margin = x1s[j - 1] > prior_median
|
|
|
|
| 56 |
|
| 57 |
is_section_header = (prior_is_section_header or current_is_section_header or prior_is_asterisk or current_is_asterisk or prior_is_date)
|
| 58 |
is_the_classic = (prior_endswith_period and current_tabbed and prior_more_left and not prior_supertabbed) # Note: a supertabbed prior line throws this rule off, so it is excluded here
|
| 59 |
is_start_blockquote = (prior_endswith_period and current_supertabbed and prior_more_left and not is_inblock)
|
| 60 |
is_after_blockquote = (prior_endswith_period and not current_supertabbed and is_inblock)
|
| 61 |
is_after_disposition = (prior_right_margin and current_tabbed)
|
|
|
|
| 62 |
|
| 63 |
if is_start_blockquote and not is_section_header:
|
| 64 |
is_inblock = True
|
| 65 |
-
# print("START BLOCK")
|
| 66 |
if is_after_blockquote or prior_is_date: # This date hack resets the blockquote state after a date line, which helps ensure that slip opinion headers do not get caught in a blockquote
|
| 67 |
is_inblock = False
|
| 68 |
|
| 69 |
-
if is_section_header or is_the_classic or is_start_blockquote or is_after_blockquote or is_after_disposition:
|
| 70 |
paras.append(para)
|
| 71 |
para = []
|
| 72 |
# print('\n')
|
|
@@ -96,5 +96,4 @@ def process_file(folderpath):
|
|
| 96 |
x1, y1, x2, y2, para_first_line, pg_ind = indent
|
| 97 |
image = cv2.imread(folderpath + '/' + str(pg_ind) + '-processed.png')
|
| 98 |
cv2.circle(image, (x1 - 15, int(0.5 * (y1 + y2))), radius=1, color=(240, 32, 160), thickness=2)
|
| 99 |
-
cv2.imwrite(folderpath + '/' + str(pg_ind) + '-processed.png', image)
|
| 100 |
-
|
|
|
|
|
|
|
| 1 |
import pandas as pd
|
| 2 |
import numpy as np
|
| 3 |
import fitz
|
|
|
|
| 52 |
current_supertabbed = x1s[j] - baselines[pg_inds[j]] >= 14
|
| 53 |
prior_more_left = (x1s[j] - baselines[pg_inds[j]]) - (x1s[j - 1] - baselines[pg_inds[j - 1]]) > 7
|
| 54 |
prior_right_margin = x1s[j - 1] > prior_median
|
| 55 |
+
current_right_margin = x1s[j] > prior_median
|
| 56 |
|
| 57 |
is_section_header = (prior_is_section_header or current_is_section_header or prior_is_asterisk or current_is_asterisk or prior_is_date)
|
| 58 |
is_the_classic = (prior_endswith_period and current_tabbed and prior_more_left and not prior_supertabbed) # Note: a supertabbed prior line throws this rule off, so it is excluded here
|
| 59 |
is_start_blockquote = (prior_endswith_period and current_supertabbed and prior_more_left and not is_inblock)
|
| 60 |
is_after_blockquote = (prior_endswith_period and not current_supertabbed and is_inblock)
|
| 61 |
is_after_disposition = (prior_right_margin and current_tabbed)
|
| 62 |
+
is_disposition = (current_right_margin and ("affirm" in line_texts[j].lower()) or "reverse" in line_texts[j].lower() or "vacate" in line_texts[j].lower() or "so ordered" in line_texts[j].lower())
|
| 63 |
|
| 64 |
if is_start_blockquote and not is_section_header:
|
| 65 |
is_inblock = True
|
|
|
|
| 66 |
if is_after_blockquote or prior_is_date: # This date hack resets the blockquote state after a date line, which helps ensure that slip opinion headers do not get caught in a blockquote
|
| 67 |
is_inblock = False
|
| 68 |
|
| 69 |
+
if is_section_header or is_the_classic or is_start_blockquote or is_after_blockquote or is_after_disposition or is_disposition:
|
| 70 |
paras.append(para)
|
| 71 |
para = []
|
| 72 |
# print('\n')
|
|
|
|
| 96 |
x1, y1, x2, y2, para_first_line, pg_ind = indent
|
| 97 |
image = cv2.imread(folderpath + '/' + str(pg_ind) + '-processed.png')
|
| 98 |
cv2.circle(image, (x1 - 15, int(0.5 * (y1 + y2))), radius=1, color=(240, 32, 160), thickness=2)
|
| 99 |
+
cv2.imwrite(folderpath + '/' + str(pg_ind) + '-processed.png', image)
|
|
|