Update TextProcessor.py: add a right-margin check (prior_right_margin) to paragraph-break detection
Browse files - TextProcessor.py +19 -17
TextProcessor.py
CHANGED
|
@@ -4,27 +4,30 @@ import fitz
|
|
| 4 |
import re
|
| 5 |
import cv2
|
| 6 |
|
|
|
|
| 7 |
def paragraphs(folderpath):
|
| 8 |
doc = fitz.open(folderpath + '/opinion.pdf')
|
| 9 |
df = pd.read_csv(folderpath + '/data.csv').replace({np.nan: None})
|
| 10 |
indices = list(df.index)
|
| 11 |
pg_indices = df['Pg Ind'].tolist()
|
| 12 |
|
| 13 |
-
x1s, y1s, x2s, y2s, line_texts, line_inds, pg_inds, baselines = [], [], [], [], [], [], [], {}
|
| 14 |
paras = []
|
| 15 |
for (i, pg_ind) in enumerate(pg_indices):
|
| 16 |
lines = eval(df[df['Pg Ind'] == i]['Lines'].tolist()[0])
|
| 17 |
-
pg_x1s = []
|
| 18 |
-
for (j,n) in enumerate(lines):
|
| 19 |
x1s.append(n[0])
|
| 20 |
y1s.append(n[1])
|
| 21 |
x2s.append(n[2])
|
| 22 |
y2s.append(n[3])
|
| 23 |
line_texts.append(n[4])
|
| 24 |
pg_x1s.append(n[0])
|
|
|
|
| 25 |
pg_inds.append(i)
|
| 26 |
line_inds.append(j)
|
| 27 |
baselines[i] = min(pg_x1s)
|
|
|
|
| 28 |
|
| 29 |
for (j, line_text) in enumerate(line_texts):
|
| 30 |
if j == 0:
|
|
@@ -32,29 +35,28 @@ def paragraphs(folderpath):
|
|
| 32 |
continue
|
| 33 |
|
| 34 |
if len(line_texts[j]) > 0:
|
| 35 |
-
prior_endswith_period = re.search('\.([^A-z]{0,2})$', line_texts[j-1].strip()) is not None
|
| 36 |
-
|
| 37 |
-
prior_is_section_header = re.search('^([ABCDEIVX]+)$', line_texts[j-1].strip()) is not None
|
| 38 |
current_is_section_header = re.search('^([ABCDEIVX]+)$', line_texts[j].strip()) is not None
|
| 39 |
-
prior_is_asterisk = re.search('^([\s\*]+)$', line_texts[j-1].strip()) is not None
|
| 40 |
current_is_asterisk = re.search('^([\s\*]+)$', line_texts[j].strip()) is not None
|
| 41 |
-
prior_is_date = re.search('(\[[A-z\s0-9]*,\s[0-9]*]+)$', line_texts[j-1].strip()) is not None
|
| 42 |
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
|
| 48 |
-
|
| 49 |
|
| 50 |
is_section_header = (prior_is_section_header or current_is_section_header or prior_is_asterisk or current_is_asterisk or prior_is_date)
|
| 51 |
-
prior_period_current_tabbed = (prior_endswith_period and current_tabbed and (prior_more_left or (not prior_supertabbed and not current_supertabbed)))
|
| 52 |
|
| 53 |
if is_section_header or prior_period_current_tabbed:
|
| 54 |
paras.append(para)
|
| 55 |
para = []
|
| 56 |
# print('\n')
|
| 57 |
-
# print(str(j) + ':\t' + str(prior_endswith_period) + '\t' + str(current_tabbed) + '\t' + str(prior_more_left)+ '\t' + str(prior_supertabbed) + '\t' + str(current_supertabbed) +'\t' + line_text)
|
| 58 |
para.append((pg_inds[j], line_inds[j], line_text))
|
| 59 |
paras.append(para)
|
| 60 |
paras_df = pd.DataFrame({'Lines': paras})
|
|
@@ -78,4 +80,4 @@ def process_file(folderpath):
|
|
| 78 |
x1, y1, x2, y2, para_first_line, pg_ind = indent
|
| 79 |
image = cv2.imread(folderpath + '/' + str(pg_ind) + '-processed.png')
|
| 80 |
cv2.circle(image, (x1 - 15, int(0.5 * (y1 + y2))), radius=1, color=(240, 32, 160), thickness=2)
|
| 81 |
-
cv2.imwrite(folderpath + '/' + str(pg_ind) + '-processed.png', image)
|
|
|
|
| 4 |
import re
|
| 5 |
import cv2
|
| 6 |
|
| 7 |
+
|
| 8 |
def paragraphs(folderpath):
|
| 9 |
doc = fitz.open(folderpath + '/opinion.pdf')
|
| 10 |
df = pd.read_csv(folderpath + '/data.csv').replace({np.nan: None})
|
| 11 |
indices = list(df.index)
|
| 12 |
pg_indices = df['Pg Ind'].tolist()
|
| 13 |
|
| 14 |
+
x1s, y1s, x2s, y2s, line_texts, line_inds, pg_inds, baselines, rights = [], [], [], [], [], [], [], {}, {}
|
| 15 |
paras = []
|
| 16 |
for (i, pg_ind) in enumerate(pg_indices):
|
| 17 |
lines = eval(df[df['Pg Ind'] == i]['Lines'].tolist()[0])
|
| 18 |
+
pg_x1s, pg_x2s = [], []
|
| 19 |
+
for (j, n) in enumerate(lines):
|
| 20 |
x1s.append(n[0])
|
| 21 |
y1s.append(n[1])
|
| 22 |
x2s.append(n[2])
|
| 23 |
y2s.append(n[3])
|
| 24 |
line_texts.append(n[4])
|
| 25 |
pg_x1s.append(n[0])
|
| 26 |
+
pg_x2s.append(n[2])
|
| 27 |
pg_inds.append(i)
|
| 28 |
line_inds.append(j)
|
| 29 |
baselines[i] = min(pg_x1s)
|
| 30 |
+
rights[i] = max(pg_x2s)
|
| 31 |
|
| 32 |
for (j, line_text) in enumerate(line_texts):
|
| 33 |
if j == 0:
|
|
|
|
| 35 |
continue
|
| 36 |
|
| 37 |
if len(line_texts[j]) > 0:
|
| 38 |
+
prior_endswith_period = re.search('\.([^A-z]{0,2})$', line_texts[j - 1].strip()) is not None
|
| 39 |
+
prior_is_section_header = re.search('^([ABCDEIVX]+)$', line_texts[j - 1].strip()) is not None
|
|
|
|
| 40 |
current_is_section_header = re.search('^([ABCDEIVX]+)$', line_texts[j].strip()) is not None
|
| 41 |
+
prior_is_asterisk = re.search('^([\s\*]+)$', line_texts[j - 1].strip()) is not None
|
| 42 |
current_is_asterisk = re.search('^([\s\*]+)$', line_texts[j].strip()) is not None
|
| 43 |
+
prior_is_date = re.search('(\[[A-z\s0-9]*,\s[0-9]*]+)$', line_texts[j - 1].strip()) is not None
|
| 44 |
|
| 45 |
+
current_tabbed = x1s[j] - baselines[pg_inds[j]] > 7
|
| 46 |
+
prior_tabbed = x1s[j - 1] - baselines[pg_inds[j - 1]] > 7
|
| 47 |
+
prior_supertabbed = x1s[j - 1] - baselines[pg_inds[j - 1]] > 18
|
| 48 |
+
current_supertabbed = x1s[j] - baselines[pg_inds[j]] > 18
|
| 49 |
+
prior_more_left = x1s[j] - x1s[j - 1] > 7
|
| 50 |
+
prior_right_margin = abs(x2s[j - 1] - rights[pg_inds[j - 1]]) < 10
|
| 51 |
|
| 52 |
is_section_header = (prior_is_section_header or current_is_section_header or prior_is_asterisk or current_is_asterisk or prior_is_date)
|
| 53 |
+
prior_period_current_tabbed = (prior_endswith_period and current_tabbed and (prior_more_left or prior_right_margin or (not prior_supertabbed and not current_supertabbed)))
|
| 54 |
|
| 55 |
if is_section_header or prior_period_current_tabbed:
|
| 56 |
paras.append(para)
|
| 57 |
para = []
|
| 58 |
# print('\n')
|
| 59 |
+
# print(str(j) + ':\t' + str(prior_endswith_period) + '\t' + str(current_tabbed) + '\t' + str(prior_more_left)+ '\t' + str(prior_supertabbed) + '\t' + str(prior_right_margin)+ '\t' + str(current_supertabbed) +'\t' + line_text)
|
| 60 |
para.append((pg_inds[j], line_inds[j], line_text))
|
| 61 |
paras.append(para)
|
| 62 |
paras_df = pd.DataFrame({'Lines': paras})
|
|
|
|
| 80 |
x1, y1, x2, y2, para_first_line, pg_ind = indent
|
| 81 |
image = cv2.imread(folderpath + '/' + str(pg_ind) + '-processed.png')
|
| 82 |
cv2.circle(image, (x1 - 15, int(0.5 * (y1 + y2))), radius=1, color=(240, 32, 160), thickness=2)
|
| 83 |
+
cv2.imwrite(folderpath + '/' + str(pg_ind) + '-processed.png', image)
|