# NOTE: this file was recovered from a scraped "Spaces" page (status: Build error);
# table-markup wrappers have been stripped from the code below.
from pathlib import Path
from typing import Tuple, List, Sequence, Optional, Union

import numpy as np
import numpy.typing as npt
from numpy import uint8
from bs4 import BeautifulSoup as bs
from PIL import Image
from torch import nn, Tensor
from torchvision import transforms

from doctrfiles import DoctrWordDetector, DoctrTextRecognizer
from unitable import UnitablePredictor
from utils import crop_an_Image, cropImageExtraMargin
from utils import denoisingAndSharpening
| ImageType = npt.NDArray[uint8] | |
| html_table_template = ( | |
| lambda table: f"""<html> | |
| <head> <meta charset="UTF-8"> | |
| <style> | |
| table, th, td {{ | |
| border: 1px solid black; | |
| font-size: 10px; | |
| }} | |
| </style> </head> | |
| <body> | |
| <table frame="hsides" rules="groups" width="100%%"> | |
| {table} | |
| </table> </body> </html>""" | |
| ) | |
class OcrTable1():
    """End-to-end table OCR pipeline.

    Unitable predicts the table structure (a list of HTML tokens) and the
    bounding box of every cell; a doctr word detector splits multi-line
    header cells into word crops; a doctr recognizer reads the text; the
    recognized strings are merged back into the structure tokens and
    rendered as a standalone HTML page.
    """

    def __init__(self, englishFlag=True):
        """Load the detector/predictor/recognizer models.

        :param englishFlag: if True use the English "master" recognizer,
            otherwise the multilingual "parseq" recognizer.
        """
        self.wordDetector = DoctrWordDetector(
            architecture="db_resnet50",
            path_weights="./doctrfiles/models/db_resnet50-79bd7d70.pt",
            path_config_json="./doctrfiles/models/db_resnet50_config.json")
        self.unitablePredictor = UnitablePredictor()
        if englishFlag:
            self.textRecognizer = DoctrTextRecognizer(
                architecture="master",
                path_weights="./doctrfiles/models/master-fde31e4a.pt",
                path_config_json="./doctrfiles/models/master.json")
        else:
            self.textRecognizer = DoctrTextRecognizer(
                architecture="parseq",
                path_weights="./doctrfiles/models/doctr-multilingual-parseq.bin",
                path_config_json="./doctrfiles/models/multilingual-parseq-config.json")

    @staticmethod
    def build_table_from_html_and_cell(
            structure: List[str], content: Optional[List[str]] = None) -> List[str]:
        """Build the table token list by filling cell placeholders with text.

        BUG FIX: this was defined without ``self`` but invoked as
        ``self.build_table_from_html_and_cell(...)``, which raised a
        TypeError; it is now a @staticmethod so both call styles work.

        :param structure: HTML structure tokens; cell tokens contain ``[]``.
        :param content: recognized cell strings, consumed in order; when
            None every cell is filled with ``"placeholder"`` (empty table).
        :return: structure tokens with ``[]`` replaced by cell text;
            surplus cell tokens (no content left) are dropped.
        """
        assert structure is not None
        html_code = []
        # Deal with an empty table: fill every cell with a placeholder.
        if content is None:
            content = ["placeholder"] * len(structure)
        for tag in structure:
            if tag in ("<td>[]</td>", ">[]</td>"):
                if len(content) == 0:
                    continue  # ran out of recognized text for this cell
                cell = content.pop(0)
                html_code.append(tag.replace("[]", cell))
            else:
                html_code.append(tag)
        return html_code

    @staticmethod
    def save_detection(detected_lines_images: List["ImageType"],
                       prefix='./res/test1/res_'):
        """Save each crop to ``<prefix><index>.png`` (debugging aid).

        BUG FIX: was defined without ``self`` but called via
        ``self.save_detection(...)``; now a @staticmethod.
        """
        for i, img in enumerate(detected_lines_images):
            Image.fromarray(img).save(prefix + str(i) + '.png')

    def predict(self, images: List["ImageType"], debug_folder="./res",
                denoise=False):
        """OCR the table in ``images`` and return prettified HTML code.

        This hardcodes index 0 into ``images``/``bbxs`` because upstream
        components pass lists but this component handles a single image.

        :param images: list with one table image (numpy uint8 array).
        :param debug_folder: folder Unitable writes debug output to.
        :param denoise: apply denoising/sharpening before prediction.
        :return: prettified HTML string of the reconstructed table.
        """
        # Step 0: locate the table via table detection. TODO
        # Pre-processing.
        if denoise:
            images = denoisingAndSharpening(images)

        pred_htmls, bbxs = self.unitablePredictor.predict(images, debug_folder)
        # BUG FIX: pred_html was referenced below before it was assigned
        # (the assignment sat after the loops); bind it up front.
        pred_html = pred_htmls[0]

        # Some tables have a lot of words in their header, and the doctr
        # word detector doesn't work well when crops aren't square — so
        # count header cells to give them special word-level treatment.
        table_header_cells = 0
        header_exists = False
        for cell in pred_html:
            if cell == '>[]</td>' or cell == '<td>[]</td>':
                table_header_cells += 1
            if cell == '</thead>':
                header_exists = True
                break
        if not header_exists:
            table_header_cells = 0

        # The smallest header-cell height approximates one text line.
        one_line_height = 100000
        for i in range(table_header_cells):
            xmin, ymin, xmax, ymax = bbxs[0][i]
            current_box_height = abs(ymax - ymin)
            if current_box_height < one_line_height:
                one_line_height = current_box_height

        # Step 2: crop every cell from the returned bboxes and recognize it.
        pred_cell = []
        cell_imgs_to_viz = []
        for box in bbxs[0]:
            xmin, ymin, xmax, ymax = box
            fourbytwo = np.array([
                [xmin, ymin],
                [xmax, ymin],
                [xmax, ymax],
                [xmin, ymax]
            ], dtype=np.float32)
            current_box_height = abs(ymax - ymin)
            # BUG FIX: reset per cell — previously a cell whose crop came
            # out empty silently reused the previous cell's crops.
            input_to_recog = []
            if table_header_cells > 0 and current_box_height > one_line_height + 5:
                # Header cell with multiple text lines: run word detection
                # and recognize each word crop separately.
                cell_img = cropImageExtraMargin([fourbytwo], images[0])[0]
                table_header_cells -= 1
                # List of 4 x 2 word boxes.
                detection_results = self.wordDetector.predict(cell_img, sort_vertical=True)
                if detection_results == []:
                    input_to_recog.append(cell_img)
                else:
                    for wordbox in detection_results:
                        cropped_image = crop_an_Image(wordbox.box, cell_img)
                        if cropped_image.shape[0] > 0 and cropped_image.shape[1] > 0:
                            input_to_recog.append(cropped_image)
                        else:
                            print("Empty image")
            else:
                # Normal cell: recognize the whole crop, no word detection.
                cell_img = crop_an_Image(fourbytwo, images[0])
                if table_header_cells > 0:
                    table_header_cells -= 1
                if cell_img.shape[0] > 0 and cell_img.shape[1] > 0:
                    input_to_recog = [cell_img]
            cell_imgs_to_viz.append(cell_img)
            if input_to_recog:
                words = self.textRecognizer.predict_for_tables(input_to_recog)
                pred_cell.append(" ".join(words))
            else:
                # Don't lose the empty cell — keep alignment with structure.
                pred_cell.append("")

        self.save_detection(cell_imgs_to_viz, prefix='./res/test1/cell_imgs_')
        print(pred_cell)

        # Step 3: merge text into the structure and render the HTML page.
        pred_code = self.build_table_from_html_and_cell(pred_html, pred_cell)
        print(pred_code)
        pred_code = html_table_template("".join(pred_code))
        # BUG FIX: name the parser explicitly — bare bs(...) emits
        # GuessedAtParserWarning and may pick different parsers per machine.
        soup = bs(pred_code, "html.parser")
        # Formatted and indented string representation of the HTML document.
        table_code = soup.prettify()
        print(table_code)
        return table_code