Spaces:

pierreguillou
/

Inference-APP-Document-Understanding-at-linelevel-v1

Runtime error

App Files Files Community

pierreguillou committed on Feb 13, 2023

Commit

723d054

1 Parent(s): d159597

Update files/functions.py

Browse files

Files changed (1) hide show

files/functions.py +3 -14

files/functions.py CHANGED Viewed

@@ -80,7 +80,7 @@ else:
 doc_stride = 128 # The authorized overlap between two parts of the context when splitting is needed.
 # max PDF page images that will be displayed
-max_imgboxes = 3
 examples_dir = 'files/'
 image_wo_content = examples_dir + "wo_content.png" # image without content
 pdf_blank = examples_dir + "blank.pdf" # blank PDF
@@ -366,8 +366,7 @@ def extraction_data_from_image(images):
     if num_imgs > 0:
         # https://pyimagesearch.com/2021/11/15/tesseract-page-segmentation-modes-psms-explained-how-to-improve-your-ocr-accuracy/
-        custom_config = r'--oem 3 --psm 3 -l eng+por+spa' # default config PyTesseract: --oem 3 --psm 3 -l eng+deu+fra+jpn+por+spa+rus+hin+chi_sim
-        # custom_config = r'--oem 3 --psm 3 -l eng' # default config PyTesseract: --oem 3 --psm 3
         results, lines, row_indexes, par_boxes, line_boxes = dict(), dict(), dict(), dict(), dict()
         images_ids_list, lines_list, par_boxes_list, line_boxes_list, images_list, page_no_list, num_pages_list = list(), list(), list(), list(), list(), list(), list()
@@ -381,15 +380,11 @@ def extraction_data_from_image(images):
                 img = np.array(img, dtype='uint8') # convert PIL to cv2
                 img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY) # gray scale image
                 ret,img = cv2.threshold(img,127,255,cv2.THRESH_BINARY)
-                # img_filepath = f"img{i}.png"
-                # img.save(img_filepath)
-                # cv2.imwrite(img_filepath, img)
                 # OCR PyTesseract | get langs of page
                 txt = pytesseract.image_to_string(img, config=custom_config)
                 # txt = os.popen(f'tesseract {img_filepath} - {custom_config}').read()
                 try:
                     langs = detect_langs(txt)
                     langs = [langdetect2Tesseract[langs[i].lang] for i in range(len(langs))]
@@ -398,15 +393,11 @@ def extraction_data_from_image(images):
                     langs_string = "eng"
                 langs_string += '+osd'
                 custom_config = f'--oem 3 --psm 3 -l {langs_string} tsv' # default config PyTesseract: --oem 3 --psm 3
-                # print("langs", i, "-", langs_string)
                 # OCR PyTesseract | get data
                 results[i] = pytesseract.image_to_data(img, config=custom_config, output_type=pytesseract.Output.DICT)
                 # results[i] = os.popen(f'tesseract {img_filepath} - {custom_config}').read()
-                # print("results[i].keys()", i, "-",results[i].keys())
-                # print("factor", factor)
                 lines[i], row_indexes[i], par_boxes[i], line_boxes[i] = get_data(results[i], factor, conf_min=0)
                 lines_list.append(lines[i])
                 par_boxes_list.append(par_boxes[i])
@@ -415,8 +406,6 @@ def extraction_data_from_image(images):
                 images_list.append(images[i])
                 page_no_list.append(i)
                 num_pages_list.append(num_imgs)
-                # print("i - lines[i], row_indexes[i], par_boxes[i], line_boxes[i]",i,"-",lines[i], row_indexes[i], par_boxes[i], line_boxes[i])
-                # print("***************************************************************")
         except:
             print(f"There was an error within the extraction of PDF text by the OCR!")

 doc_stride = 128 # The authorized overlap between two parts of the context when splitting is needed.
 # max PDF page images that will be displayed
+max_imgboxes = 2
 examples_dir = 'files/'
 image_wo_content = examples_dir + "wo_content.png" # image without content
 pdf_blank = examples_dir + "blank.pdf" # blank PDF
     if num_imgs > 0:
         # https://pyimagesearch.com/2021/11/15/tesseract-page-segmentation-modes-psms-explained-how-to-improve-your-ocr-accuracy/
+        custom_config = r'--oem 3 --psm 3 -l eng' # default config PyTesseract: --oem 3 --psm 3 -l eng+deu+fra+jpn+por+spa+rus+hin+chi_sim
         results, lines, row_indexes, par_boxes, line_boxes = dict(), dict(), dict(), dict(), dict()
         images_ids_list, lines_list, par_boxes_list, line_boxes_list, images_list, page_no_list, num_pages_list = list(), list(), list(), list(), list(), list(), list()
                 img = np.array(img, dtype='uint8') # convert PIL to cv2
                 img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY) # gray scale image
                 ret,img = cv2.threshold(img,127,255,cv2.THRESH_BINARY)
                 # OCR PyTesseract | get langs of page
                 txt = pytesseract.image_to_string(img, config=custom_config)
                 # txt = os.popen(f'tesseract {img_filepath} - {custom_config}').read()
                 try:
                     langs = detect_langs(txt)
                     langs = [langdetect2Tesseract[langs[i].lang] for i in range(len(langs))]
                     langs_string = "eng"
                 langs_string += '+osd'
                 custom_config = f'--oem 3 --psm 3 -l {langs_string} tsv' # default config PyTesseract: --oem 3 --psm 3
                 # OCR PyTesseract | get data
                 results[i] = pytesseract.image_to_data(img, config=custom_config, output_type=pytesseract.Output.DICT)
                 # results[i] = os.popen(f'tesseract {img_filepath} - {custom_config}').read()
                 lines[i], row_indexes[i], par_boxes[i], line_boxes[i] = get_data(results[i], factor, conf_min=0)
                 lines_list.append(lines[i])
                 par_boxes_list.append(par_boxes[i])
                 images_list.append(images[i])
                 page_no_list.append(i)
                 num_pages_list.append(num_imgs)
         except:
             print(f"There was an error within the extraction of PDF text by the OCR!")