cools committed on
Commit
95a5062
·
1 Parent(s): ae3c0d9

Update TextProcessor.py

Browse files
Files changed (1) hide show
  1. TextProcessor.py +6 -4
TextProcessor.py CHANGED
@@ -1,3 +1,4 @@
 
1
  import pandas as pd
2
  import numpy as np
3
  import fitz
@@ -59,7 +60,7 @@ def paragraphs(folderpath):
59
  is_start_blockquote = (prior_endswith_period and current_supertabbed and prior_more_left and not is_inblock)
60
  is_after_blockquote = (prior_endswith_period and not current_supertabbed and is_inblock)
61
  is_after_disposition = (prior_right_margin and current_tabbed)
62
- is_disposition = (current_right_margin and ("affirm" in line_texts[j].lower()) or "reverse" in line_texts[j].lower() or "vacate" in line_texts[j].lower() or "so ordered" in line_texts[j].lower())
63
 
64
  if is_start_blockquote and not is_section_header:
65
  is_inblock = True
@@ -69,9 +70,10 @@ def paragraphs(folderpath):
69
  if is_section_header or is_the_classic or is_start_blockquote or is_after_blockquote or is_after_disposition or is_disposition:
70
  paras.append(para)
71
  para = []
72
- # print('\n')
73
- # print(str([j, pg_inds[j]]) + ':\t' + str(is_the_classic) + '\t' + str(is_start_blockquote) + '\t' + str(is_after_blockquote)+ '\t' + str(is_after_disposition) + '\t' + str(is_section_header) + '\t' + line_text)
74
  para.append((pg_inds[j], line_inds[j], is_inblock, line_text))
 
75
  paras.append(para)
76
  paras_df = pd.DataFrame({'Lines': paras})
77
  return paras_df
@@ -96,4 +98,4 @@ def process_file(folderpath):
96
  x1, y1, x2, y2, para_first_line, pg_ind = indent
97
  image = cv2.imread(folderpath + '/' + str(pg_ind) + '-processed.png')
98
  cv2.circle(image, (x1 - 15, int(0.5 * (y1 + y2))), radius=1, color=(240, 32, 160), thickness=2)
99
- cv2.imwrite(folderpath + '/' + str(pg_ind) + '-processed.png', image)
 
1
+ # This file converts the images into text
2
  import pandas as pd
3
  import numpy as np
4
  import fitz
 
60
  is_start_blockquote = (prior_endswith_period and current_supertabbed and prior_more_left and not is_inblock)
61
  is_after_blockquote = (prior_endswith_period and not current_supertabbed and is_inblock)
62
  is_after_disposition = (prior_right_margin and current_tabbed)
63
+ is_disposition = (current_right_margin and ("affirm" in line_texts[j].lower() or "reverse" in line_texts[j].lower() or "vacate" in line_texts[j].lower() or "so ordered" in line_texts[j].lower()))
64
 
65
  if is_start_blockquote and not is_section_header:
66
  is_inblock = True
 
70
  if is_section_header or is_the_classic or is_start_blockquote or is_after_blockquote or is_after_disposition or is_disposition:
71
  paras.append(para)
72
  para = []
73
+ print('\n')
74
+ print(str([j, pg_inds[j]]) + ':\t' + str(is_the_classic) + '\t' + str(is_start_blockquote) + '\t' + str(is_after_blockquote)+ '\t' + str(is_after_disposition) + '\t' + str(is_section_header) + '\t' + line_text)
75
  para.append((pg_inds[j], line_inds[j], is_inblock, line_text))
76
+
77
  paras.append(para)
78
  paras_df = pd.DataFrame({'Lines': paras})
79
  return paras_df
 
98
  x1, y1, x2, y2, para_first_line, pg_ind = indent
99
  image = cv2.imread(folderpath + '/' + str(pg_ind) + '-processed.png')
100
  cv2.circle(image, (x1 - 15, int(0.5 * (y1 + y2))), radius=1, color=(240, 32, 160), thickness=2)
101
+ cv2.imwrite(folderpath + '/' + str(pg_ind) + '-processed.png', image)