Update TextProcessor.py
Browse files
Changed files: TextProcessor.py (+5 -1)
TextProcessor.py
CHANGED
|
@@ -1,3 +1,4 @@
|
|
|
|
|
| 1 |
import pandas as pd
|
| 2 |
import numpy as np
|
| 3 |
import fitz
|
|
@@ -77,11 +78,13 @@ def paragraphs(folderpath):
|
|
| 77 |
|
| 78 |
def process_file(folderpath):
|
| 79 |
paras_df = paragraphs(folderpath)
|
| 80 |
-
paras_df.to_csv(folderpath + '/paragraphs.csv', index=
|
| 81 |
data_df = pd.read_csv(folderpath + '/data.csv')
|
| 82 |
paras_lines = paras_df['Lines'].tolist()
|
| 83 |
indents = []
|
| 84 |
for (i, para_lines) in enumerate(paras_lines):
|
|
|
|
|
|
|
| 85 |
para = []
|
| 86 |
para_start_pg_ind, para_start_line_ind, _, para_first_line = para_lines[0]
|
| 87 |
page_df = data_df[data_df['Pg Ind'] == para_start_pg_ind]
|
|
@@ -94,3 +97,4 @@ def process_file(folderpath):
|
|
| 94 |
image = cv2.imread(folderpath + '/' + str(pg_ind) + '-processed.png')
|
| 95 |
cv2.circle(image, (x1 - 15, int(0.5 * (y1 + y2))), radius=1, color=(240, 32, 160), thickness=2)
|
| 96 |
cv2.imwrite(folderpath + '/' + str(pg_ind) + '-processed.png', image)
|
|
|
|
|
|
| 1 |
+
# This file converts the images into text
|
| 2 |
import pandas as pd
|
| 3 |
import numpy as np
|
| 4 |
import fitz
|
|
|
|
| 78 |
|
| 79 |
def process_file(folderpath):
|
| 80 |
paras_df = paragraphs(folderpath)
|
| 81 |
+
paras_df.to_csv(folderpath + '/paragraphs.csv', index=False)
|
| 82 |
data_df = pd.read_csv(folderpath + '/data.csv')
|
| 83 |
paras_lines = paras_df['Lines'].tolist()
|
| 84 |
indents = []
|
| 85 |
for (i, para_lines) in enumerate(paras_lines):
|
| 86 |
+
if para_lines == []:
|
| 87 |
+
continue
|
| 88 |
para = []
|
| 89 |
para_start_pg_ind, para_start_line_ind, _, para_first_line = para_lines[0]
|
| 90 |
page_df = data_df[data_df['Pg Ind'] == para_start_pg_ind]
|
|
|
|
| 97 |
image = cv2.imread(folderpath + '/' + str(pg_ind) + '-processed.png')
|
| 98 |
cv2.circle(image, (x1 - 15, int(0.5 * (y1 + y2))), radius=1, color=(240, 32, 160), thickness=2)
|
| 99 |
cv2.imwrite(folderpath + '/' + str(pg_ind) + '-processed.png', image)
|
| 100 |
+
|