cools committed on
Commit
1bba217
·
1 Parent(s): e9dfae8

Add Text Processor

Browse files

Paragraph Identifier

Files changed (1) hide show
  1. TextProcessor.py +83 -0
TextProcessor.py ADDED
@@ -0,0 +1,83 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import numpy as np
3
+ import fitz
4
+ import re
5
+ import cv2
6
+
7
# Patterns applied to a line's stripped text when deciding paragraph breaks.
_ENDS_WITH_PERIOD_RE = re.compile(r'\.([^A-Za-z]{0,2})$')
_SECTION_HEADER_RE = re.compile(r'^([ABCDEIVX]+)$')
_ASTERISK_RE = re.compile(r'^([\s\*]+)$')
_DATE_RE = re.compile(r'(\[[A-Za-z\s0-9]*,\s[0-9]*]+)$')

# Horizontal offsets, in the same units as the x coordinates stored in
# data.csv, beyond which a line counts as indented / deeply indented.
_TAB_OFFSET = 7
_SUPERTAB_OFFSET = 18


def _starts_new_paragraph(prior_text, current_text, prior_x1, current_x1,
                          prior_baseline, current_baseline):
    """Return True when the current line should open a new paragraph."""
    prior_text = prior_text.strip()
    current_text = current_text.strip()

    # Section markers, asterisk separators and a trailing bracketed date
    # always force a break around them.
    if (_SECTION_HEADER_RE.search(prior_text)
            or _SECTION_HEADER_RE.search(current_text)
            or _ASTERISK_RE.search(prior_text)
            or _ASTERISK_RE.search(current_text)
            or _DATE_RE.search(prior_text)):
        return True

    # Otherwise break only on "sentence ended, next line is indented".
    prior_endswith_period = _ENDS_WITH_PERIOD_RE.search(prior_text) is not None
    current_tabbed = current_x1 - current_baseline > _TAB_OFFSET
    if not (prior_endswith_period and current_tabbed):
        return False

    prior_more_left = current_x1 - prior_x1 > _TAB_OFFSET
    prior_supertabbed = prior_x1 - prior_baseline > _SUPERTAB_OFFSET
    current_supertabbed = current_x1 - current_baseline > _SUPERTAB_OFFSET
    # When both lines are deeply indented (and the prior one is not further
    # left) the pair is treated as one indented run, not a new paragraph.
    return prior_more_left or not (prior_supertabbed or current_supertabbed)


def paragraphs(folderpath):
    """Group the text lines listed in ``folderpath/data.csv`` into paragraphs.

    ``data.csv`` must have a ``Pg Ind`` column and a ``Lines`` column whose
    cells hold the Python-literal text of a list of
    ``(x1, y1, x2, y2, text)`` entries for that page.

    Args:
        folderpath: Folder containing ``data.csv``.

    Returns:
        A DataFrame with a single ``Lines`` column. Each cell is a list of
        ``(page index, line index within page, line text)`` tuples forming
        one paragraph. The frame is empty when there are no lines.

    Changes from the previous implementation:
        * ``Lines`` is parsed with ``ast.literal_eval`` instead of ``eval``.
        * Pages are identified by their ``Pg Ind`` value, not loop position.
        * The very first line is kept (it used to be dropped).
        * Empty paragraphs are never emitted; empty input returns an empty
          frame instead of raising.
        * ``opinion.pdf`` is no longer opened: the handle was never used or
          closed.
    """
    import ast  # only needed here, to parse the serialized 'Lines' cells

    df = pd.read_csv(folderpath + '/data.csv').replace({np.nan: None})

    x1s, line_texts, line_inds, pg_inds, baselines = [], [], [], [], {}
    for pg_ind, lines_repr in zip(df['Pg Ind'].tolist(), df['Lines'].tolist()):
        lines = ast.literal_eval(lines_repr) if isinstance(lines_repr, str) else []
        if not lines:
            continue  # a page without lines has no baseline to compute
        for line_ind, line in enumerate(lines):
            x1s.append(line[0])
            line_texts.append(line[4])
            pg_inds.append(pg_ind)
            line_inds.append(line_ind)
        # The left-most x on the page is the reference for indentation.
        baselines[pg_ind] = min(line[0] for line in lines)

    paras = []
    para = []
    for j, line_text in enumerate(line_texts):
        # NOTE(review): lines with empty text are skipped entirely; the
        # original nesting was ambiguous on this point -- confirm.
        if not line_text:
            continue

        if j > 0 and _starts_new_paragraph(
                line_texts[j - 1] or '', line_text,
                x1s[j - 1], x1s[j],
                baselines[pg_inds[j - 1]], baselines[pg_inds[j]]):
            if para:
                paras.append(para)
            para = []
        para.append((pg_inds[j], line_inds[j], line_text))

    if para:
        paras.append(para)
    return pd.DataFrame({'Lines': paras})
62
+
63
def process_file(folderpath):
    """Write paragraphs.csv and mark each paragraph start on the page images.

    Calls ``paragraphs(folderpath)``, saves the result to
    ``folderpath/paragraphs.csv``, then draws a small circle just left of the
    first line of every paragraph on ``folderpath/<page index>-processed.png``.
    The images are overwritten in place.

    Args:
        folderpath: Folder containing ``data.csv`` and the
            ``<page index>-processed.png`` images.

    Raises:
        FileNotFoundError: If a page image that needs marking cannot be read.
    """
    import ast  # only needed here, to parse the serialized 'Lines' cells
    from collections import defaultdict

    paras_df = paragraphs(folderpath)
    paras_df.to_csv(folderpath + '/paragraphs.csv', index=True)

    # Parse each page's line list once instead of once per paragraph.
    # NOTE: literal_eval replaces eval so CSV contents cannot execute code.
    data_df = pd.read_csv(folderpath + '/data.csv')
    page_lines = {}
    for pg_ind, lines_repr in zip(data_df['Pg Ind'].tolist(),
                                  data_df['Lines'].tolist()):
        # The first row for a page wins, matching the previous lookup.
        if pg_ind not in page_lines and isinstance(lines_repr, str):
            page_lines[pg_ind] = ast.literal_eval(lines_repr)

    # Collect circle centres per page so each image is read and written once.
    marks_by_page = defaultdict(list)
    for para_lines in paras_df['Lines'].tolist():
        if not para_lines:
            continue  # nothing to mark for an empty paragraph
        pg_ind, line_ind, _first_line = para_lines[0]
        line = page_lines[pg_ind][line_ind]
        x1, y1, y2 = line[0], line[1], line[3]
        # cv2.circle requires integer pixel coordinates.
        marks_by_page[pg_ind].append((int(x1 - 15), int(0.5 * (y1 + y2))))

    for pg_ind, centers in marks_by_page.items():
        image_path = folderpath + '/' + str(pg_ind) + '-processed.png'
        image = cv2.imread(image_path)
        if image is None:
            # imread signals failure by returning None rather than raising.
            raise FileNotFoundError('Could not read page image: ' + image_path)
        for center in centers:
            cv2.circle(image, center, radius=1, color=(240, 32, 160), thickness=2)
        cv2.imwrite(image_path, image)
82
+
83
# Top-level call: processes the sample case folder whenever this file is run
# (it also executes when the module is imported).
process_file(folderpath='PDF Cases/333_178')