Spaces:
Runtime error
Runtime error
uncomment table detection
Browse files
app.py
CHANGED
|
@@ -640,61 +640,60 @@ class TableExtractionPipeline():
|
|
| 640 |
|
| 641 |
caption_ocr_res = await asyncio.gather(*sequential_caption_img_list)
|
| 642 |
flag_caption_pos = 0 # 0=top, 1=bottom
|
| 643 |
-
|
| 644 |
-
|
| 645 |
-
|
| 646 |
-
|
| 647 |
-
break
|
| 648 |
|
| 649 |
for idx, caption_text in enumerate(caption_ocr_res):
|
| 650 |
if idx%2==flag_caption_pos:
|
| 651 |
c3.text(str(idx) + "_" + caption_text)
|
| 652 |
|
| 653 |
|
| 654 |
-
|
| 655 |
-
|
| 656 |
-
|
| 657 |
-
|
| 658 |
-
|
| 659 |
-
|
| 660 |
-
|
| 661 |
-
|
| 662 |
-
|
| 663 |
-
|
| 664 |
-
|
| 665 |
-
|
| 666 |
-
|
| 667 |
-
|
| 668 |
-
|
| 669 |
-
|
| 670 |
-
|
| 671 |
-
|
| 672 |
-
|
| 673 |
-
|
| 674 |
-
|
| 675 |
-
|
| 676 |
-
|
| 677 |
-
|
| 678 |
-
|
| 679 |
-
|
| 680 |
-
|
| 681 |
-
|
| 682 |
-
|
| 683 |
-
|
| 684 |
-
|
| 685 |
-
|
| 686 |
-
|
| 687 |
-
|
| 688 |
-
|
| 689 |
-
|
| 690 |
-
|
| 691 |
-
|
| 692 |
-
|
| 693 |
-
|
| 694 |
-
|
| 695 |
-
|
| 696 |
-
|
| 697 |
-
|
| 698 |
# except:
|
| 699 |
# st.write('Either incorrectly identified table or no table, to debug remove try/except')
|
| 700 |
# break
|
|
|
|
| 640 |
|
| 641 |
caption_ocr_res = await asyncio.gather(*sequential_caption_img_list)
|
| 642 |
flag_caption_pos = 0 # 0=top, 1=bottom
|
| 643 |
+
if "table" in caption_ocr_res[0].lower() or "表" in caption_ocr_res[0]:
|
| 644 |
+
flag_caption_pos=0
|
| 645 |
+
else:
|
| 646 |
+
flag_caption_pos=1
|
|
|
|
| 647 |
|
| 648 |
for idx, caption_text in enumerate(caption_ocr_res):
|
| 649 |
if idx%2==flag_caption_pos:
|
| 650 |
c3.text(str(idx) + "_" + caption_text)
|
| 651 |
|
| 652 |
|
| 653 |
+
for idx, unpadded_table in enumerate(cropped_img_list):
|
| 654 |
+
|
| 655 |
+
table = self.add_padding(unpadded_table, padd_top, padd_right,
|
| 656 |
+
padd_bottom, padd_left)
|
| 657 |
+
# table = super_res(table)
|
| 658 |
+
# table = binarizeBlur_image(table)
|
| 659 |
+
# table = sharpen_image(table) # Test sharpen image next
|
| 660 |
+
# table = td_postprocess(table)
|
| 661 |
+
|
| 662 |
+
# table.save("result"+str(idx)+".png")
|
| 663 |
+
|
| 664 |
+
probas, bboxes_scaled = table_struct_recog(
|
| 665 |
+
table, THRESHOLD_PROBA=TSR_THRESHOLD)
|
| 666 |
+
rows, cols = self.generate_structure(c2, table_recognition_model,
|
| 667 |
+
table, probas, bboxes_scaled,
|
| 668 |
+
expand_rowcol_bbox_top,
|
| 669 |
+
expand_rowcol_bbox_bottom)
|
| 670 |
+
# st.write(len(rows), len(cols))
|
| 671 |
+
rows, cols = self.sort_table_featuresv2(rows, cols)
|
| 672 |
+
master_row, cols = self.individual_table_featuresv2(
|
| 673 |
+
table, rows, cols)
|
| 674 |
+
|
| 675 |
+
cells_img, max_cols, max_rows = self.object_to_cellsv2(
|
| 676 |
+
master_row, cols, expand_rowcol_bbox_top,
|
| 677 |
+
expand_rowcol_bbox_bottom, padd_left)
|
| 678 |
+
|
| 679 |
+
sequential_cell_img_list = []
|
| 680 |
+
for k, img_list in cells_img.items():
|
| 681 |
+
for img in img_list:
|
| 682 |
+
# img = super_res(img)
|
| 683 |
+
# img = sharpen_image(img) # Test sharpen image next
|
| 684 |
+
# img = binarizeBlur_image(img)
|
| 685 |
+
# img = self.add_padding(img, 10,10,10,10)
|
| 686 |
+
# plt.imshow(img)
|
| 687 |
+
# c3.pyplot()
|
| 688 |
+
sequential_cell_img_list.append(
|
| 689 |
+
pytess(cell_pil_img=img, threshold=OCR_THRESHOLD))
|
| 690 |
+
|
| 691 |
+
cell_ocr_res = await asyncio.gather(*sequential_cell_img_list)
|
| 692 |
+
|
| 693 |
+
self.create_dataframe(c3, cell_ocr_res, max_cols, max_rows)
|
| 694 |
+
st.write(
|
| 695 |
+
'Errors in OCR is due to either quality of the image or performance of the OCR'
|
| 696 |
+
)
|
| 697 |
# except:
|
| 698 |
# st.write('Either incorrectly identified table or no table, to debug remove try/except')
|
| 699 |
# break
|