Commit
·
723d054
1
Parent(s):
d159597
Update files/functions.py
Browse files- files/functions.py +3 -14
files/functions.py
CHANGED
|
@@ -80,7 +80,7 @@ else:
|
|
| 80 |
doc_stride = 128 # The authorized overlap between two parts of the context when splitting is needed.
|
| 81 |
|
| 82 |
# max PDF page images that will be displayed
|
| 83 |
-
max_imgboxes =
|
| 84 |
examples_dir = 'files/'
|
| 85 |
image_wo_content = examples_dir + "wo_content.png" # image without content
|
| 86 |
pdf_blank = examples_dir + "blank.pdf" # blank PDF
|
|
@@ -366,8 +366,7 @@ def extraction_data_from_image(images):
|
|
| 366 |
if num_imgs > 0:
|
| 367 |
|
| 368 |
# https://pyimagesearch.com/2021/11/15/tesseract-page-segmentation-modes-psms-explained-how-to-improve-your-ocr-accuracy/
|
| 369 |
-
custom_config = r'--oem 3 --psm 3 -l eng
|
| 370 |
-
# custom_config = r'--oem 3 --psm 3 -l eng' # default config PyTesseract: --oem 3 --psm 3
|
| 371 |
results, lines, row_indexes, par_boxes, line_boxes = dict(), dict(), dict(), dict(), dict()
|
| 372 |
images_ids_list, lines_list, par_boxes_list, line_boxes_list, images_list, page_no_list, num_pages_list = list(), list(), list(), list(), list(), list(), list()
|
| 373 |
|
|
@@ -381,15 +380,11 @@ def extraction_data_from_image(images):
|
|
| 381 |
img = np.array(img, dtype='uint8') # convert PIL to cv2
|
| 382 |
img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY) # gray scale image
|
| 383 |
ret,img = cv2.threshold(img,127,255,cv2.THRESH_BINARY)
|
| 384 |
-
# img_filepath = f"img{i}.png"
|
| 385 |
-
# img.save(img_filepath)
|
| 386 |
-
# cv2.imwrite(img_filepath, img)
|
| 387 |
|
| 388 |
# OCR PyTesseract | get langs of page
|
| 389 |
txt = pytesseract.image_to_string(img, config=custom_config)
|
| 390 |
-
|
| 391 |
# txt = os.popen(f'tesseract {img_filepath} - {custom_config}').read()
|
| 392 |
-
|
| 393 |
try:
|
| 394 |
langs = detect_langs(txt)
|
| 395 |
langs = [langdetect2Tesseract[langs[i].lang] for i in range(len(langs))]
|
|
@@ -398,15 +393,11 @@ def extraction_data_from_image(images):
|
|
| 398 |
langs_string = "eng"
|
| 399 |
langs_string += '+osd'
|
| 400 |
custom_config = f'--oem 3 --psm 3 -l {langs_string} tsv' # default config PyTesseract: --oem 3 --psm 3
|
| 401 |
-
# print("langs", i, "-", langs_string)
|
| 402 |
|
| 403 |
# OCR PyTesseract | get data
|
| 404 |
results[i] = pytesseract.image_to_data(img, config=custom_config, output_type=pytesseract.Output.DICT)
|
| 405 |
-
|
| 406 |
# results[i] = os.popen(f'tesseract {img_filepath} - {custom_config}').read()
|
| 407 |
-
# print("results[i].keys()", i, "-",results[i].keys())
|
| 408 |
|
| 409 |
-
# print("factor", factor)
|
| 410 |
lines[i], row_indexes[i], par_boxes[i], line_boxes[i] = get_data(results[i], factor, conf_min=0)
|
| 411 |
lines_list.append(lines[i])
|
| 412 |
par_boxes_list.append(par_boxes[i])
|
|
@@ -415,8 +406,6 @@ def extraction_data_from_image(images):
|
|
| 415 |
images_list.append(images[i])
|
| 416 |
page_no_list.append(i)
|
| 417 |
num_pages_list.append(num_imgs)
|
| 418 |
-
# print("i - lines[i], row_indexes[i], par_boxes[i], line_boxes[i]",i,"-",lines[i], row_indexes[i], par_boxes[i], line_boxes[i])
|
| 419 |
-
# print("***************************************************************")
|
| 420 |
|
| 421 |
except:
|
| 422 |
print(f"There was an error within the extraction of PDF text by the OCR!")
|
|
|
|
| 80 |
doc_stride = 128 # The authorized overlap between two parts of the context when splitting is needed.
|
| 81 |
|
| 82 |
# max PDF page images that will be displayed
|
| 83 |
+
max_imgboxes = 2
|
| 84 |
examples_dir = 'files/'
|
| 85 |
image_wo_content = examples_dir + "wo_content.png" # image without content
|
| 86 |
pdf_blank = examples_dir + "blank.pdf" # blank PDF
|
|
|
|
| 366 |
if num_imgs > 0:
|
| 367 |
|
| 368 |
# https://pyimagesearch.com/2021/11/15/tesseract-page-segmentation-modes-psms-explained-how-to-improve-your-ocr-accuracy/
|
| 369 |
+
custom_config = r'--oem 3 --psm 3 -l eng' # default config PyTesseract: --oem 3 --psm 3 -l eng+deu+fra+jpn+por+spa+rus+hin+chi_sim
|
|
|
|
| 370 |
results, lines, row_indexes, par_boxes, line_boxes = dict(), dict(), dict(), dict(), dict()
|
| 371 |
images_ids_list, lines_list, par_boxes_list, line_boxes_list, images_list, page_no_list, num_pages_list = list(), list(), list(), list(), list(), list(), list()
|
| 372 |
|
|
|
|
| 380 |
img = np.array(img, dtype='uint8') # convert PIL to cv2
|
| 381 |
img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY) # gray scale image
|
| 382 |
ret,img = cv2.threshold(img,127,255,cv2.THRESH_BINARY)
|
|
|
|
|
|
|
|
|
|
| 383 |
|
| 384 |
# OCR PyTesseract | get langs of page
|
| 385 |
txt = pytesseract.image_to_string(img, config=custom_config)
|
|
|
|
| 386 |
# txt = os.popen(f'tesseract {img_filepath} - {custom_config}').read()
|
| 387 |
+
|
| 388 |
try:
|
| 389 |
langs = detect_langs(txt)
|
| 390 |
langs = [langdetect2Tesseract[langs[i].lang] for i in range(len(langs))]
|
|
|
|
| 393 |
langs_string = "eng"
|
| 394 |
langs_string += '+osd'
|
| 395 |
custom_config = f'--oem 3 --psm 3 -l {langs_string} tsv' # default config PyTesseract: --oem 3 --psm 3
|
|
|
|
| 396 |
|
| 397 |
# OCR PyTesseract | get data
|
| 398 |
results[i] = pytesseract.image_to_data(img, config=custom_config, output_type=pytesseract.Output.DICT)
|
|
|
|
| 399 |
# results[i] = os.popen(f'tesseract {img_filepath} - {custom_config}').read()
|
|
|
|
| 400 |
|
|
|
|
| 401 |
lines[i], row_indexes[i], par_boxes[i], line_boxes[i] = get_data(results[i], factor, conf_min=0)
|
| 402 |
lines_list.append(lines[i])
|
| 403 |
par_boxes_list.append(par_boxes[i])
|
|
|
|
| 406 |
images_list.append(images[i])
|
| 407 |
page_no_list.append(i)
|
| 408 |
num_pages_list.append(num_imgs)
|
|
|
|
|
|
|
| 409 |
|
| 410 |
except:
|
| 411 |
print(f"There was an error within the extraction of PDF text by the OCR!")
|