cools committed on
Commit
c599bf4
·
1 Parent(s): b0d8a16

Update TextProcessor.py

Browse files
Files changed (1) hide show
  1. TextProcessor.py +9 -14
TextProcessor.py CHANGED
@@ -12,7 +12,7 @@ def paragraphs(folderpath):
12
  pg_indices = df['Pg Ind'].tolist()
13
 
14
  x1s, y1s, x2s, y2s, line_texts, line_inds, pg_inds, baselines, rights = [], [], [], [], [], [], [], {}, {}
15
- paras, types = [], []
16
  for (i, pg_ind) in enumerate(pg_indices):
17
  lines = eval(df[df['Pg Ind'] == i]['Lines'].tolist()[0])
18
  pg_x1s, pg_x2s = [], []
@@ -48,8 +48,8 @@ def paragraphs(folderpath):
48
 
49
  current_tabbed = x1s[j] - baselines[pg_inds[j]] > 7
50
  prior_tabbed = x1s[j - 1] - baselines[pg_inds[j - 1]] > 7
51
- prior_supertabbed = x1s[j - 1] - baselines[pg_inds[j - 1]] >= 11
52
- current_supertabbed = x1s[j] - baselines[pg_inds[j]] >= 11
53
  prior_more_left = (x1s[j] - baselines[pg_inds[j]]) - (x1s[j - 1] - baselines[pg_inds[j - 1]]) > 7
54
  prior_right_margin = x1s[j - 1] > prior_median
55
 
@@ -58,24 +58,21 @@ def paragraphs(folderpath):
58
  is_start_blockquote = (prior_endswith_period and current_supertabbed and prior_more_left and not is_inblock)
59
  is_after_blockquote = (prior_endswith_period and not current_supertabbed and is_inblock)
60
  is_after_disposition = (prior_right_margin and current_tabbed)
61
- # is_after_oneline_paragraph = (prior_tabbed and current_tabbed and not prior_supertabbed and not current_supertabbed and not is_inblock)
62
 
63
  if is_start_blockquote and not is_section_header:
64
  is_inblock = True
65
- if is_after_blockquote or prior_is_date: # This date hack helps ensure that slip opinion headers do not get caught
 
66
  is_inblock = False
67
 
68
  if is_section_header or is_the_classic or is_start_blockquote or is_after_blockquote or is_after_disposition:
69
  paras.append(para)
70
- types.append(is_inblock)
71
  para = []
72
  # print('\n')
73
- # print(str([j, pg_inds[j]]) + ':\t' + str(is_the_classic) + '\t' + str(is_start_blockquote) + '\t' + str(is_after_blockquote)+ '\t' + str(is_after_disposition) + '\t' + str(is_after_oneline_paragraph) + '\t' + line_text)
74
- para.append((pg_inds[j], line_inds[j], line_text))
75
-
76
  paras.append(para)
77
- types.append(is_inblock)
78
- paras_df = pd.DataFrame({'Lines': paras, 'Block Quote': types})
79
  return paras_df
80
 
81
  def process_file(folderpath):
@@ -86,7 +83,7 @@ def process_file(folderpath):
86
  indents = []
87
  for (i, para_lines) in enumerate(paras_lines):
88
  para = []
89
- para_start_pg_ind, para_start_line_ind, para_first_line = para_lines[0]
90
  page_df = data_df[data_df['Pg Ind'] == para_start_pg_ind]
91
  pg_lines = eval(page_df['Lines'].tolist()[0])
92
  x1, y1, x2, y2, text = pg_lines[para_start_line_ind]
@@ -97,5 +94,3 @@ def process_file(folderpath):
97
  image = cv2.imread(folderpath + '/' + str(pg_ind) + '-processed.png')
98
  cv2.circle(image, (x1 - 15, int(0.5 * (y1 + y2))), radius=1, color=(240, 32, 160), thickness=2)
99
  cv2.imwrite(folderpath + '/' + str(pg_ind) + '-processed.png', image)
100
-
101
- process_file('PDF Cases/333_178')
 
12
  pg_indices = df['Pg Ind'].tolist()
13
 
14
  x1s, y1s, x2s, y2s, line_texts, line_inds, pg_inds, baselines, rights = [], [], [], [], [], [], [], {}, {}
15
+ paras = []
16
  for (i, pg_ind) in enumerate(pg_indices):
17
  lines = eval(df[df['Pg Ind'] == i]['Lines'].tolist()[0])
18
  pg_x1s, pg_x2s = [], []
 
48
 
49
  current_tabbed = x1s[j] - baselines[pg_inds[j]] > 7
50
  prior_tabbed = x1s[j - 1] - baselines[pg_inds[j - 1]] > 7
51
+ prior_supertabbed = x1s[j - 1] - baselines[pg_inds[j - 1]] >= 14
52
+ current_supertabbed = x1s[j] - baselines[pg_inds[j]] >= 14
53
  prior_more_left = (x1s[j] - baselines[pg_inds[j]]) - (x1s[j - 1] - baselines[pg_inds[j - 1]]) > 7
54
  prior_right_margin = x1s[j - 1] > prior_median
55
 
 
58
  is_start_blockquote = (prior_endswith_period and current_supertabbed and prior_more_left and not is_inblock)
59
  is_after_blockquote = (prior_endswith_period and not current_supertabbed and is_inblock)
60
  is_after_disposition = (prior_right_margin and current_tabbed)
 
61
 
62
  if is_start_blockquote and not is_section_header:
63
  is_inblock = True
64
+ # print("START BLOCK")
65
+ if is_after_blockquote or prior_is_date: # This date hack helps ensure that slip opinion headers do not get caught
66
  is_inblock = False
67
 
68
  if is_section_header or is_the_classic or is_start_blockquote or is_after_blockquote or is_after_disposition:
69
  paras.append(para)
 
70
  para = []
71
  # print('\n')
72
+ # print(str([j, pg_inds[j]]) + ':\t' + str(is_the_classic) + '\t' + str(is_start_blockquote) + '\t' + str(is_after_blockquote)+ '\t' + str(is_after_disposition) + '\t' + str(is_section_header) + '\t' + line_text)
73
+ para.append((pg_inds[j], line_inds[j], is_inblock, line_text))
 
74
  paras.append(para)
75
+ paras_df = pd.DataFrame({'Lines': paras})
 
76
  return paras_df
77
 
78
  def process_file(folderpath):
 
83
  indents = []
84
  for (i, para_lines) in enumerate(paras_lines):
85
  para = []
86
+ para_start_pg_ind, para_start_line_ind, _, para_first_line = para_lines[0]
87
  page_df = data_df[data_df['Pg Ind'] == para_start_pg_ind]
88
  pg_lines = eval(page_df['Lines'].tolist()[0])
89
  x1, y1, x2, y2, text = pg_lines[para_start_line_ind]
 
94
  image = cv2.imread(folderpath + '/' + str(pg_ind) + '-processed.png')
95
  cv2.circle(image, (x1 - 15, int(0.5 * (y1 + y2))), radius=1, color=(240, 32, 160), thickness=2)
96
  cv2.imwrite(folderpath + '/' + str(pg_ind) + '-processed.png', image)