Spaces:
Sleeping
Sleeping
| import pandas as pd | |
| import re # regex | |
| import numpy as np | |
| import PIL.Image as Image | |
| from paddleocr import PPStructure | |
| import html_to_json | |
class TableEx:
    """Extract table contents from images with PaddleOCR's PP-Structure engine.

    The engine is built once in ``__init__`` and reused for every call.
    """

    def __init__(self):
        # CPU-only English engine with layout analysis disabled; the keyword
        # arguments are passed through to PPStructure unchanged.
        self.table_engine = PPStructure(
            lang='en',
            layout=False,
            show_log=True,
            use_gpu=False,
            download_models=True,
            rec=True,
        )

    def extract_table_information(self, pil_image: np.ndarray):
        """Run the table engine on an image and return its structured content.

        Args:
            pil_image: Image array handed directly to the PP-Structure engine.

        Returns:
            The nested list produced by ``html_to_json.convert_tables`` with
            empty elements removed, or -- when structure extraction fails --
            a plain string of the recognised text joined by spaces (``''`` if
            no text is available either).
        """
        result = self.table_engine(pil_image)
        try:
            # assumes result[0]['res'] is a dict carrying the table HTML under
            # 'html' -- inferred from this indexing; confirm against PaddleOCR.
            tables = html_to_json.convert_tables(result[0]['res']['html'])
            return self.remove_empty_elements(tables)
        except Exception:
            # Deliberate best-effort: any failure in the structured path
            # (missing keys, empty result, parser error) degrades to text.
            print('Structure extraction Failed, using fallback plain text.')
            return self._plain_text_fallback(result)

    @staticmethod
    def _plain_text_fallback(result):
        """Join the 'text' fields found in the first result entry, else ''."""
        if not result or not isinstance(result[0], dict):
            return ''
        entries = result[0].get('res')
        # Only a sequence of per-line dicts carries text; a dict `res`
        # (table result without usable HTML) has nothing to join.
        if not isinstance(entries, (list, tuple)):
            return ''
        return ' '.join(
            entry['text']
            for entry in entries
            if isinstance(entry, dict) and 'text' in entry
        )

    def remove_empty_elements(self, nested_list):
        """Recursively drop empty strings and empty sublists from a nested list.

        Sublists that become empty after cleaning are dropped as well. Items
        that are neither lists nor ``''`` are kept untouched.
        """
        cleaned_list = []
        for item in nested_list:
            if isinstance(item, list):
                cleaned_sublist = self.remove_empty_elements(item)
                if cleaned_sublist:
                    cleaned_list.append(cleaned_sublist)
            elif item != '':
                cleaned_list.append(item)
        return cleaned_list

    @staticmethod
    def _error_record(x1, y1, x2, y2, message):
        """Log a failure and build the error dict returned in place of data."""
        print(f"Error extracting table data: {message}")
        return {
            "region": f"({x1}, {y1}) to ({x2}, {y2})",
            "error": message,
            "data": None
        }

    def extract_table_data(self, img_array, x1, y1, x2, y2):
        """Crop a table region out of an image and extract its contents.

        Args:
            img_array: Image array indexed as ``[row, column, ...]``.
            x1, y1, x2, y2: Corners of the table region; values outside the
                image are clamped to its bounds.

        Returns:
            A ``(table_image, table_data)`` tuple. ``table_image`` is the
            cropped region as a PIL image, or ``None`` when the region is
            empty or could not be converted. ``table_data`` is the first
            extracted table, the full fallback text when only plain text was
            recovered, or a dict with ``region``, ``error`` and ``data`` keys
            when extraction failed.
        """
        top, bottom = max(0, y1), min(img_array.shape[0], y2)
        left, right = max(0, x1), min(img_array.shape[1], x2)
        table_region = img_array[top:bottom, left:right]

        # An empty crop previously fell through to the return statement with
        # both results unbound; report it explicitly instead.
        if table_region.size == 0:
            return None, self._error_record(x1, y1, x2, y2, "Empty table region")

        table_image = None
        try:
            # Keep the cropped image so callers can display it.
            table_image = Image.fromarray(table_region)
            extracted_info = self.extract_table_information(table_region)
        except Exception as e:
            return table_image, self._error_record(x1, y1, x2, y2, str(e))

        if not extracted_info:
            return table_image, self._error_record(
                x1, y1, x2, y2, "No table content could be extracted"
            )
        if isinstance(extracted_info, str):
            # Plain-text fallback: indexing would keep only the first character.
            return table_image, extracted_info
        # Structured result: the first extracted table.
        return table_image, extracted_info[0]