Update TextProcessor.py
Browse files- TextProcessor.py +23 -25
TextProcessor.py
CHANGED
|
@@ -16,7 +16,7 @@ def paragraphs(folderpath):
|
|
| 16 |
for (i, pg_ind) in enumerate(pg_indices):
|
| 17 |
lines = eval(df[df['Pg Ind'] == i]['Lines'].tolist()[0])
|
| 18 |
pg_x1s, pg_x2s = [], []
|
| 19 |
-
for (j,
|
| 20 |
x1s.append(n[0])
|
| 21 |
y1s.append(n[1])
|
| 22 |
x2s.append(n[2])
|
|
@@ -34,44 +34,42 @@ def paragraphs(folderpath):
|
|
| 34 |
if j == 0:
|
| 35 |
para = []
|
| 36 |
continue
|
| 37 |
-
|
| 38 |
if len(line_texts[j]) > 0:
|
| 39 |
-
prior_median = (baselines[pg_inds[j
|
| 40 |
-
current_median = (baselines[pg_inds[j]] + rights[pg_inds[j]])
|
| 41 |
-
|
| 42 |
-
prior_endswith_period = re.search('[:\.]([^A-z]{0,2})$',
|
| 43 |
-
|
| 44 |
-
prior_is_section_header = re.search('^([ABCDEIVX]+)$', line_texts[j - 1].strip()) is not None
|
| 45 |
current_is_section_header = re.search('^([ABCDEIVX]+)$', line_texts[j].strip()) is not None
|
| 46 |
-
prior_is_asterisk = re.search('^([\s\*]+)$', line_texts[j
|
| 47 |
current_is_asterisk = re.search('^([\s\*]+)$', line_texts[j].strip()) is not None
|
| 48 |
-
prior_is_date = re.search('(\[[A-z\s0-9]*,\s[0-9]*]+)$', line_texts[j
|
| 49 |
-
|
| 50 |
-
current_tabbed = x1s[j]
|
| 51 |
-
prior_tabbed = x1s[j
|
| 52 |
-
prior_supertabbed = x1s[j
|
| 53 |
-
current_supertabbed = x1s[j]
|
| 54 |
-
prior_more_left = (x1s[j]
|
| 55 |
-
prior_right_margin = x1s[j
|
| 56 |
|
| 57 |
is_section_header = (prior_is_section_header or current_is_section_header or prior_is_asterisk or current_is_asterisk or prior_is_date)
|
| 58 |
-
is_the_classic = (prior_endswith_period and current_tabbed and prior_more_left and not prior_supertabbed)
|
| 59 |
-
is_start_blockquote = (prior_endswith_period and current_supertabbed and prior_more_left)
|
| 60 |
-
is_after_blockquote = (prior_endswith_period and not current_supertabbed and
|
| 61 |
is_after_disposition = (prior_right_margin and current_tabbed)
|
| 62 |
is_after_oneline_paragraph = (prior_tabbed and current_tabbed and not prior_supertabbed and not current_supertabbed and not is_inblock)
|
| 63 |
-
|
| 64 |
-
if is_start_blockquote and not is_section_header
|
| 65 |
is_inblock = True
|
| 66 |
if is_after_blockquote:
|
| 67 |
is_inblock = False
|
| 68 |
-
|
| 69 |
if is_section_header or is_the_classic or is_after_oneline_paragraph or is_start_blockquote or is_after_blockquote or is_after_disposition:
|
| 70 |
paras.append(para)
|
| 71 |
para = []
|
| 72 |
print('\n')
|
| 73 |
-
print(str([j, pg_inds[j]]) + ':\t' + str(is_the_classic) + '\t' + str(is_start_blockquote) + '\t' + str(
|
| 74 |
-
is_after_blockquote) + '\t' + str(is_after_disposition) + '\t' + line_text)
|
| 75 |
para.append((pg_inds[j], line_inds[j], line_text))
|
| 76 |
|
| 77 |
paras.append(para)
|
|
|
|
| 16 |
for (i, pg_ind) in enumerate(pg_indices):
|
| 17 |
lines = eval(df[df['Pg Ind'] == i]['Lines'].tolist()[0])
|
| 18 |
pg_x1s, pg_x2s = [], []
|
| 19 |
+
for (j,n) in enumerate(lines):
|
| 20 |
x1s.append(n[0])
|
| 21 |
y1s.append(n[1])
|
| 22 |
x2s.append(n[2])
|
|
|
|
| 34 |
if j == 0:
|
| 35 |
para = []
|
| 36 |
continue
|
| 37 |
+
|
| 38 |
if len(line_texts[j]) > 0:
|
| 39 |
+
prior_median = (baselines[pg_inds[j-1]] + rights[pg_inds[j-1]])/2
|
| 40 |
+
current_median = (baselines[pg_inds[j]] + rights[pg_inds[j]])/2
|
| 41 |
+
|
| 42 |
+
prior_endswith_period = re.search('[:\.]([^A-z]{0,2})$', line_texts[j-1].strip()) is not None # Include colon?
|
| 43 |
+
prior_is_section_header = re.search('^([ABCDEIVX]+)$', line_texts[j-1].strip()) is not None
|
|
|
|
| 44 |
current_is_section_header = re.search('^([ABCDEIVX]+)$', line_texts[j].strip()) is not None
|
| 45 |
+
prior_is_asterisk = re.search('^([\s\*]+)$', line_texts[j-1].strip()) is not None
|
| 46 |
current_is_asterisk = re.search('^([\s\*]+)$', line_texts[j].strip()) is not None
|
| 47 |
+
prior_is_date = re.search('(\[[A-z\s0-9]*,\s[0-9]*]+)$', line_texts[j-1].strip()) is not None
|
| 48 |
+
|
| 49 |
+
current_tabbed = x1s[j]-baselines[pg_inds[j]] > 7
|
| 50 |
+
prior_tabbed = x1s[j-1]-baselines[pg_inds[j-1]] > 7
|
| 51 |
+
prior_supertabbed = x1s[j-1]-baselines[pg_inds[j-1]] > 18
|
| 52 |
+
current_supertabbed = x1s[j]-baselines[pg_inds[j]] > 18
|
| 53 |
+
prior_more_left = (x1s[j]-baselines[pg_inds[j]])-(x1s[j-1]-baselines[pg_inds[j-1]]) > 7
|
| 54 |
+
prior_right_margin = x1s[j-1] > prior_median
|
| 55 |
|
| 56 |
is_section_header = (prior_is_section_header or current_is_section_header or prior_is_asterisk or current_is_asterisk or prior_is_date)
|
| 57 |
+
is_the_classic = (prior_endswith_period and current_tabbed and prior_more_left and not prior_supertabbed) # Note: Supertabbing oofs stuff
|
| 58 |
+
is_start_blockquote = (prior_endswith_period and current_supertabbed and prior_more_left and not is_inblock)
|
| 59 |
+
is_after_blockquote = (prior_endswith_period and not current_supertabbed and is_inblock)
|
| 60 |
is_after_disposition = (prior_right_margin and current_tabbed)
|
| 61 |
is_after_oneline_paragraph = (prior_tabbed and current_tabbed and not prior_supertabbed and not current_supertabbed and not is_inblock)
|
| 62 |
+
|
| 63 |
+
if is_start_blockquote and not is_section_header:
|
| 64 |
is_inblock = True
|
| 65 |
if is_after_blockquote:
|
| 66 |
is_inblock = False
|
| 67 |
+
|
| 68 |
if is_section_header or is_the_classic or is_after_oneline_paragraph or is_start_blockquote or is_after_blockquote or is_after_disposition:
|
| 69 |
paras.append(para)
|
| 70 |
para = []
|
| 71 |
print('\n')
|
| 72 |
+
print(str([j, pg_inds[j]]) + ':\t' + str(is_the_classic) + '\t' + str(is_start_blockquote) + '\t' + str(is_after_blockquote)+ '\t' + str(is_after_disposition) + '\t' + line_text)
|
|
|
|
| 73 |
para.append((pg_inds[j], line_inds[j], line_text))
|
| 74 |
|
| 75 |
paras.append(para)
|