cools committed on
Commit
f586d1e
·
1 Parent(s): cc79f5a

Update TextProcessor.py

Browse files
Files changed (1) hide show
  1. TextProcessor.py +19 -17
TextProcessor.py CHANGED
@@ -4,27 +4,30 @@ import fitz
4
  import re
5
  import cv2
6
 
 
7
  def paragraphs(folderpath):
8
  doc = fitz.open(folderpath + '/opinion.pdf')
9
  df = pd.read_csv(folderpath + '/data.csv').replace({np.nan: None})
10
  indices = list(df.index)
11
  pg_indices = df['Pg Ind'].tolist()
12
 
13
- x1s, y1s, x2s, y2s, line_texts, line_inds, pg_inds, baselines = [], [], [], [], [], [], [], {}
14
  paras = []
15
  for (i, pg_ind) in enumerate(pg_indices):
16
  lines = eval(df[df['Pg Ind'] == i]['Lines'].tolist()[0])
17
- pg_x1s = []
18
- for (j,n) in enumerate(lines):
19
  x1s.append(n[0])
20
  y1s.append(n[1])
21
  x2s.append(n[2])
22
  y2s.append(n[3])
23
  line_texts.append(n[4])
24
  pg_x1s.append(n[0])
 
25
  pg_inds.append(i)
26
  line_inds.append(j)
27
  baselines[i] = min(pg_x1s)
 
28
 
29
  for (j, line_text) in enumerate(line_texts):
30
  if j == 0:
@@ -32,29 +35,28 @@ def paragraphs(folderpath):
32
  continue
33
 
34
  if len(line_texts[j]) > 0:
35
- prior_endswith_period = re.search('\.([^A-z]{0,2})$', line_texts[j-1].strip()) is not None
36
-
37
- prior_is_section_header = re.search('^([ABCDEIVX]+)$', line_texts[j-1].strip()) is not None
38
  current_is_section_header = re.search('^([ABCDEIVX]+)$', line_texts[j].strip()) is not None
39
- prior_is_asterisk = re.search('^([\s\*]+)$', line_texts[j-1].strip()) is not None
40
  current_is_asterisk = re.search('^([\s\*]+)$', line_texts[j].strip()) is not None
41
- prior_is_date = re.search('(\[[A-z\s0-9]*,\s[0-9]*]+)$', line_texts[j-1].strip()) is not None
42
 
43
- current_upper = line_text[0].isupper()
44
- current_tabbed = x1s[j]-baselines[pg_inds[j]] > 7
45
- prior_tabbed = x1s[j-1]-baselines[pg_inds[j-1]] > 7
46
- prior_supertabbed = x1s[j-1]-baselines[pg_inds[j-1]] > 18
47
- current_supertabbed = x1s[j]-baselines[pg_inds[j]] > 18
48
- prior_more_left = x1s[j]-x1s[j-1] > 7
49
 
50
  is_section_header = (prior_is_section_header or current_is_section_header or prior_is_asterisk or current_is_asterisk or prior_is_date)
51
- prior_period_current_tabbed = (prior_endswith_period and current_tabbed and (prior_more_left or (not prior_supertabbed and not current_supertabbed)))
52
 
53
  if is_section_header or prior_period_current_tabbed:
54
  paras.append(para)
55
  para = []
56
  # print('\n')
57
- # print(str(j) + ':\t' + str(prior_endswith_period) + '\t' + str(current_tabbed) + '\t' + str(prior_more_left)+ '\t' + str(prior_supertabbed) + '\t' + str(current_supertabbed) +'\t' + line_text)
58
  para.append((pg_inds[j], line_inds[j], line_text))
59
  paras.append(para)
60
  paras_df = pd.DataFrame({'Lines': paras})
@@ -78,4 +80,4 @@ def process_file(folderpath):
78
  x1, y1, x2, y2, para_first_line, pg_ind = indent
79
  image = cv2.imread(folderpath + '/' + str(pg_ind) + '-processed.png')
80
  cv2.circle(image, (x1 - 15, int(0.5 * (y1 + y2))), radius=1, color=(240, 32, 160), thickness=2)
81
- cv2.imwrite(folderpath + '/' + str(pg_ind) + '-processed.png', image)
 
4
  import re
5
  import cv2
6
 
7
+
8
  def paragraphs(folderpath):
9
  doc = fitz.open(folderpath + '/opinion.pdf')
10
  df = pd.read_csv(folderpath + '/data.csv').replace({np.nan: None})
11
  indices = list(df.index)
12
  pg_indices = df['Pg Ind'].tolist()
13
 
14
+ x1s, y1s, x2s, y2s, line_texts, line_inds, pg_inds, baselines, rights = [], [], [], [], [], [], [], {}, {}
15
  paras = []
16
  for (i, pg_ind) in enumerate(pg_indices):
17
  lines = eval(df[df['Pg Ind'] == i]['Lines'].tolist()[0])
18
+ pg_x1s, pg_x2s = [], []
19
+ for (j, n) in enumerate(lines):
20
  x1s.append(n[0])
21
  y1s.append(n[1])
22
  x2s.append(n[2])
23
  y2s.append(n[3])
24
  line_texts.append(n[4])
25
  pg_x1s.append(n[0])
26
+ pg_x2s.append(n[2])
27
  pg_inds.append(i)
28
  line_inds.append(j)
29
  baselines[i] = min(pg_x1s)
30
+ rights[i] = max(pg_x2s)
31
 
32
  for (j, line_text) in enumerate(line_texts):
33
  if j == 0:
 
35
  continue
36
 
37
  if len(line_texts[j]) > 0:
38
+ prior_endswith_period = re.search('\.([^A-z]{0,2})$', line_texts[j - 1].strip()) is not None
39
+ prior_is_section_header = re.search('^([ABCDEIVX]+)$', line_texts[j - 1].strip()) is not None
 
40
  current_is_section_header = re.search('^([ABCDEIVX]+)$', line_texts[j].strip()) is not None
41
+ prior_is_asterisk = re.search('^([\s\*]+)$', line_texts[j - 1].strip()) is not None
42
  current_is_asterisk = re.search('^([\s\*]+)$', line_texts[j].strip()) is not None
43
+ prior_is_date = re.search('(\[[A-z\s0-9]*,\s[0-9]*]+)$', line_texts[j - 1].strip()) is not None
44
 
45
+ current_tabbed = x1s[j] - baselines[pg_inds[j]] > 7
46
+ prior_tabbed = x1s[j - 1] - baselines[pg_inds[j - 1]] > 7
47
+ prior_supertabbed = x1s[j - 1] - baselines[pg_inds[j - 1]] > 18
48
+ current_supertabbed = x1s[j] - baselines[pg_inds[j]] > 18
49
+ prior_more_left = x1s[j] - x1s[j - 1] > 7
50
+ prior_right_margin = abs(x2s[j - 1] - rights[pg_inds[j - 1]]) < 10
51
 
52
  is_section_header = (prior_is_section_header or current_is_section_header or prior_is_asterisk or current_is_asterisk or prior_is_date)
53
+ prior_period_current_tabbed = (prior_endswith_period and current_tabbed and (prior_more_left or prior_right_margin or (not prior_supertabbed and not current_supertabbed)))
54
 
55
  if is_section_header or prior_period_current_tabbed:
56
  paras.append(para)
57
  para = []
58
  # print('\n')
59
+ # print(str(j) + ':\t' + str(prior_endswith_period) + '\t' + str(current_tabbed) + '\t' + str(prior_more_left)+ '\t' + str(prior_supertabbed) + '\t' + str(prior_right_margin)+ '\t' + str(current_supertabbed) +'\t' + line_text)
60
  para.append((pg_inds[j], line_inds[j], line_text))
61
  paras.append(para)
62
  paras_df = pd.DataFrame({'Lines': paras})
 
80
  x1, y1, x2, y2, para_first_line, pg_ind = indent
81
  image = cv2.imread(folderpath + '/' + str(pg_ind) + '-processed.png')
82
  cv2.circle(image, (x1 - 15, int(0.5 * (y1 + y2))), radius=1, color=(240, 32, 160), thickness=2)
83
+ cv2.imwrite(folderpath + '/' + str(pg_ind) + '-processed.png', image)