cools committed on
Commit
6edc22c
·
1 Parent(s): 2bc03b3

Update TextProcessor.py

Browse files
Files changed (1) hide show
  1. TextProcessor.py +5 -2
TextProcessor.py CHANGED
@@ -12,7 +12,7 @@ def paragraphs(folderpath):
12
  pg_indices = df['Pg Ind'].tolist()
13
 
14
  x1s, y1s, x2s, y2s, line_texts, line_inds, pg_inds, baselines, rights = [], [], [], [], [], [], [], {}, {}
15
- paras = []
16
  for (i, pg_ind) in enumerate(pg_indices):
17
  lines = eval(df[df['Pg Ind'] == i]['Lines'].tolist()[0])
18
  pg_x1s, pg_x2s = [], []
@@ -67,13 +67,15 @@ def paragraphs(folderpath):
67
 
68
  if is_section_header or is_the_classic or is_after_oneline_paragraph or is_start_blockquote or is_after_blockquote or is_after_disposition:
69
  paras.append(para)
 
70
  para = []
71
  # print('\n')
72
  # print(str([j, pg_inds[j]]) + ':\t' + str(is_the_classic) + '\t' + str(is_start_blockquote) + '\t' + str(is_after_blockquote)+ '\t' + str(is_after_disposition) + '\t' + str(is_after_oneline_paragraph) + '\t' + line_text)
73
  para.append((pg_inds[j], line_inds[j], line_text))
74
 
75
  paras.append(para)
76
- paras_df = pd.DataFrame({'Lines': paras})
 
77
  return paras_df
78
 
79
  def process_file(folderpath):
@@ -96,3 +98,4 @@ def process_file(folderpath):
96
  cv2.circle(image, (x1 - 15, int(0.5 * (y1 + y2))), radius=1, color=(240, 32, 160), thickness=2)
97
  cv2.imwrite(folderpath + '/' + str(pg_ind) + '-processed.png', image)
98
 
 
 
12
  pg_indices = df['Pg Ind'].tolist()
13
 
14
  x1s, y1s, x2s, y2s, line_texts, line_inds, pg_inds, baselines, rights = [], [], [], [], [], [], [], {}, {}
15
+ paras, types = [], []
16
  for (i, pg_ind) in enumerate(pg_indices):
17
  lines = eval(df[df['Pg Ind'] == i]['Lines'].tolist()[0])
18
  pg_x1s, pg_x2s = [], []
 
67
 
68
  if is_section_header or is_the_classic or is_after_oneline_paragraph or is_start_blockquote or is_after_blockquote or is_after_disposition:
69
  paras.append(para)
70
+ types.append(is_inblock)
71
  para = []
72
  # print('\n')
73
  # print(str([j, pg_inds[j]]) + ':\t' + str(is_the_classic) + '\t' + str(is_start_blockquote) + '\t' + str(is_after_blockquote)+ '\t' + str(is_after_disposition) + '\t' + str(is_after_oneline_paragraph) + '\t' + line_text)
74
  para.append((pg_inds[j], line_inds[j], line_text))
75
 
76
  paras.append(para)
77
+ types.append(is_inblock)
78
+ paras_df = pd.DataFrame({'Lines': paras, 'Block Quote': types})
79
  return paras_df
80
 
81
  def process_file(folderpath):
 
98
  cv2.circle(image, (x1 - 15, int(0.5 * (y1 + y2))), radius=1, color=(240, 32, 160), thickness=2)
99
  cv2.imwrite(folderpath + '/' + str(pg_ind) + '-processed.png', image)
100
 
101
+ process_file('PDF Cases/333_178')