cools committed on
Commit
0c734ac
·
1 Parent(s): b7b8dac

Update TextProcessor.py

Browse files
Files changed (1) hide show
  1. TextProcessor.py +5 -1
TextProcessor.py CHANGED
@@ -1,3 +1,4 @@
 
1
  import pandas as pd
2
  import numpy as np
3
  import fitz
@@ -77,11 +78,13 @@ def paragraphs(folderpath):
77
 
78
  def process_file(folderpath):
79
  paras_df = paragraphs(folderpath)
80
- paras_df.to_csv(folderpath + '/paragraphs.csv', index=True)
81
  data_df = pd.read_csv(folderpath + '/data.csv')
82
  paras_lines = paras_df['Lines'].tolist()
83
  indents = []
84
  for (i, para_lines) in enumerate(paras_lines):
 
 
85
  para = []
86
  para_start_pg_ind, para_start_line_ind, _, para_first_line = para_lines[0]
87
  page_df = data_df[data_df['Pg Ind'] == para_start_pg_ind]
@@ -94,3 +97,4 @@ def process_file(folderpath):
94
  image = cv2.imread(folderpath + '/' + str(pg_ind) + '-processed.png')
95
  cv2.circle(image, (x1 - 15, int(0.5 * (y1 + y2))), radius=1, color=(240, 32, 160), thickness=2)
96
  cv2.imwrite(folderpath + '/' + str(pg_ind) + '-processed.png', image)
 
 
1
+ # This file converts the images into text
2
  import pandas as pd
3
  import numpy as np
4
  import fitz
 
78
 
79
  def process_file(folderpath):
80
  paras_df = paragraphs(folderpath)
81
+ paras_df.to_csv(folderpath + '/paragraphs.csv', index=False)
82
  data_df = pd.read_csv(folderpath + '/data.csv')
83
  paras_lines = paras_df['Lines'].tolist()
84
  indents = []
85
  for (i, para_lines) in enumerate(paras_lines):
86
+ if para_lines == []:
87
+ continue
88
  para = []
89
  para_start_pg_ind, para_start_line_ind, _, para_first_line = para_lines[0]
90
  page_df = data_df[data_df['Pg Ind'] == para_start_pg_ind]
 
97
  image = cv2.imread(folderpath + '/' + str(pg_ind) + '-processed.png')
98
  cv2.circle(image, (x1 - 15, int(0.5 * (y1 + y2))), radius=1, color=(240, 32, 160), thickness=2)
99
  cv2.imwrite(folderpath + '/' + str(pg_ind) + '-processed.png', image)
100
+