Update TextProcessor.py
Browse files
- TextProcessor.py +9 -14
TextProcessor.py
CHANGED
|
@@ -12,7 +12,7 @@ def paragraphs(folderpath):
|
|
| 12 |
pg_indices = df['Pg Ind'].tolist()
|
| 13 |
|
| 14 |
x1s, y1s, x2s, y2s, line_texts, line_inds, pg_inds, baselines, rights = [], [], [], [], [], [], [], {}, {}
|
| 15 |
-
paras
|
| 16 |
for (i, pg_ind) in enumerate(pg_indices):
|
| 17 |
lines = eval(df[df['Pg Ind'] == i]['Lines'].tolist()[0])
|
| 18 |
pg_x1s, pg_x2s = [], []
|
|
@@ -48,8 +48,8 @@ def paragraphs(folderpath):
|
|
| 48 |
|
| 49 |
current_tabbed = x1s[j] - baselines[pg_inds[j]] > 7
|
| 50 |
prior_tabbed = x1s[j - 1] - baselines[pg_inds[j - 1]] > 7
|
| 51 |
-
prior_supertabbed = x1s[j - 1] - baselines[pg_inds[j - 1]] >=
|
| 52 |
-
current_supertabbed = x1s[j] - baselines[pg_inds[j]] >=
|
| 53 |
prior_more_left = (x1s[j] - baselines[pg_inds[j]]) - (x1s[j - 1] - baselines[pg_inds[j - 1]]) > 7
|
| 54 |
prior_right_margin = x1s[j - 1] > prior_median
|
| 55 |
|
|
@@ -58,24 +58,21 @@ def paragraphs(folderpath):
|
|
| 58 |
is_start_blockquote = (prior_endswith_period and current_supertabbed and prior_more_left and not is_inblock)
|
| 59 |
is_after_blockquote = (prior_endswith_period and not current_supertabbed and is_inblock)
|
| 60 |
is_after_disposition = (prior_right_margin and current_tabbed)
|
| 61 |
-
# is_after_oneline_paragraph = (prior_tabbed and current_tabbed and not prior_supertabbed and not current_supertabbed and not is_inblock)
|
| 62 |
|
| 63 |
if is_start_blockquote and not is_section_header:
|
| 64 |
is_inblock = True
|
| 65 |
-
|
|
|
|
| 66 |
is_inblock = False
|
| 67 |
|
| 68 |
if is_section_header or is_the_classic or is_start_blockquote or is_after_blockquote or is_after_disposition:
|
| 69 |
paras.append(para)
|
| 70 |
-
types.append(is_inblock)
|
| 71 |
para = []
|
| 72 |
# print('\n')
|
| 73 |
-
# print(str([j, pg_inds[j]]) + ':\t' + str(is_the_classic) + '\t' + str(is_start_blockquote)
|
| 74 |
-
para.append((pg_inds[j], line_inds[j], line_text))
|
| 75 |
-
|
| 76 |
paras.append(para)
|
| 77 |
-
|
| 78 |
-
paras_df = pd.DataFrame({'Lines': paras, 'Block Quote': types})
|
| 79 |
return paras_df
|
| 80 |
|
| 81 |
def process_file(folderpath):
|
|
@@ -86,7 +83,7 @@ def process_file(folderpath):
|
|
| 86 |
indents = []
|
| 87 |
for (i, para_lines) in enumerate(paras_lines):
|
| 88 |
para = []
|
| 89 |
-
para_start_pg_ind, para_start_line_ind, para_first_line = para_lines[0]
|
| 90 |
page_df = data_df[data_df['Pg Ind'] == para_start_pg_ind]
|
| 91 |
pg_lines = eval(page_df['Lines'].tolist()[0])
|
| 92 |
x1, y1, x2, y2, text = pg_lines[para_start_line_ind]
|
|
@@ -97,5 +94,3 @@ def process_file(folderpath):
|
|
| 97 |
image = cv2.imread(folderpath + '/' + str(pg_ind) + '-processed.png')
|
| 98 |
cv2.circle(image, (x1 - 15, int(0.5 * (y1 + y2))), radius=1, color=(240, 32, 160), thickness=2)
|
| 99 |
cv2.imwrite(folderpath + '/' + str(pg_ind) + '-processed.png', image)
|
| 100 |
-
|
| 101 |
-
process_file('PDF Cases/333_178')
|
|
|
|
| 12 |
pg_indices = df['Pg Ind'].tolist()
|
| 13 |
|
| 14 |
x1s, y1s, x2s, y2s, line_texts, line_inds, pg_inds, baselines, rights = [], [], [], [], [], [], [], {}, {}
|
| 15 |
+
paras = []
|
| 16 |
for (i, pg_ind) in enumerate(pg_indices):
|
| 17 |
lines = eval(df[df['Pg Ind'] == i]['Lines'].tolist()[0])
|
| 18 |
pg_x1s, pg_x2s = [], []
|
|
|
|
| 48 |
|
| 49 |
current_tabbed = x1s[j] - baselines[pg_inds[j]] > 7
|
| 50 |
prior_tabbed = x1s[j - 1] - baselines[pg_inds[j - 1]] > 7
|
| 51 |
+
prior_supertabbed = x1s[j - 1] - baselines[pg_inds[j - 1]] >= 14
|
| 52 |
+
current_supertabbed = x1s[j] - baselines[pg_inds[j]] >= 14
|
| 53 |
prior_more_left = (x1s[j] - baselines[pg_inds[j]]) - (x1s[j - 1] - baselines[pg_inds[j - 1]]) > 7
|
| 54 |
prior_right_margin = x1s[j - 1] > prior_median
|
| 55 |
|
|
|
|
| 58 |
is_start_blockquote = (prior_endswith_period and current_supertabbed and prior_more_left and not is_inblock)
|
| 59 |
is_after_blockquote = (prior_endswith_period and not current_supertabbed and is_inblock)
|
| 60 |
is_after_disposition = (prior_right_margin and current_tabbed)
|
|
|
|
| 61 |
|
| 62 |
if is_start_blockquote and not is_section_header:
|
| 63 |
is_inblock = True
|
| 64 |
+
# print("START BLOCK")
|
| 65 |
+
if is_after_blockquote or prior_is_date: # This date hack helps ensure that slip opinion headers do not get caught
|
| 66 |
is_inblock = False
|
| 67 |
|
| 68 |
if is_section_header or is_the_classic or is_start_blockquote or is_after_blockquote or is_after_disposition:
|
| 69 |
paras.append(para)
|
|
|
|
| 70 |
para = []
|
| 71 |
# print('\n')
|
| 72 |
+
# print(str([j, pg_inds[j]]) + ':\t' + str(is_the_classic) + '\t' + str(is_start_blockquote) + '\t' + str(is_after_blockquote)+ '\t' + str(is_after_disposition) + '\t' + str(is_section_header) + '\t' + line_text)
|
| 73 |
+
para.append((pg_inds[j], line_inds[j], is_inblock, line_text))
|
|
|
|
| 74 |
paras.append(para)
|
| 75 |
+
paras_df = pd.DataFrame({'Lines': paras})
|
|
|
|
| 76 |
return paras_df
|
| 77 |
|
| 78 |
def process_file(folderpath):
|
|
|
|
| 83 |
indents = []
|
| 84 |
for (i, para_lines) in enumerate(paras_lines):
|
| 85 |
para = []
|
| 86 |
+
para_start_pg_ind, para_start_line_ind, _, para_first_line = para_lines[0]
|
| 87 |
page_df = data_df[data_df['Pg Ind'] == para_start_pg_ind]
|
| 88 |
pg_lines = eval(page_df['Lines'].tolist()[0])
|
| 89 |
x1, y1, x2, y2, text = pg_lines[para_start_line_ind]
|
|
|
|
| 94 |
image = cv2.imread(folderpath + '/' + str(pg_ind) + '-processed.png')
|
| 95 |
cv2.circle(image, (x1 - 15, int(0.5 * (y1 + y2))), radius=1, color=(240, 32, 160), thickness=2)
|
| 96 |
cv2.imwrite(folderpath + '/' + str(pg_ind) + '-processed.png', image)
|
|
|
|
|
|