| |
| """ |
| JSON Table to XML Converter |
| Processes JSON files containing table data and corresponding PNG images |
| to create cropped sub-table images and XML coordinate files for ALL tables found. |
| """ |
|
|
| import json |
| import xml.etree.ElementTree as ET |
| from xml.dom import minidom |
| import os |
| from typing import Dict, List, Tuple, Any, Optional |
|
|
class TableProcessor:
    """Main class for processing table data from JSON to XML with image cropping."""

    def __init__(self, padding_ratio: float = 0.05):
        """
        Initialize the table processor.

        Args:
            padding_ratio: Padding around table as ratio of min(width, height)
        """
        self.padding_ratio = padding_ratio
        # Fallback cell dimensions (px) used when columnWidths/rowHeights
        # omit an entry for a given column/row index.
        self.DEFAULT_WIDTH = 100
        self.DEFAULT_HEIGHT = 30

    def extract_tables_from_json(self, json_data: Any) -> List[Dict]:
        """
        Extract all table items from JSON data.

        Args:
            json_data: Parsed JSON data (dict or list)

        Returns:
            List of table dictionaries (empty when nothing matches)
        """
        if isinstance(json_data, list):
            # Guard against non-dict entries (strings, nulls, numbers) in the
            # item list — item.get() would raise AttributeError on those.
            return [
                item for item in json_data
                if isinstance(item, dict) and item.get("type") == "table"
            ]
        if isinstance(json_data, dict) and json_data.get("type") == "table":
            return [json_data]
        return []

    def calculate_cell_coordinates(self, table_properties: Dict, table_x: float, table_y: float) -> Dict[Tuple[int, int], Dict]:
        """
        Calculate coordinates for all visible cells in the table.

        Args:
            table_properties: Table properties from JSON
            table_x: Table X position in original image
            table_y: Table Y position in original image

        Returns:
            Dictionary mapping (row, col) to coordinate info with keys
            "x", "y", "width", "height", "colspan", "rowspan"
        """
        rows = table_properties.get("rows", 0)
        columns = table_properties.get("columns", 0)
        column_widths = table_properties.get("columnWidths", {})
        row_heights = table_properties.get("rowHeights", {})
        merged_cells = table_properties.get("mergedCells", {})
        hidden_cells = table_properties.get("hiddenCells", {})

        def get_col_width(col: int) -> int:
            # JSON object keys are strings; missing entries use the default.
            return column_widths.get(str(col), self.DEFAULT_WIDTH)

        def get_row_height(row: int) -> int:
            return row_heights.get(str(row), self.DEFAULT_HEIGHT)

        # Prefix sums: x_offsets[c] is the x of column c's left edge relative
        # to the table origin. O(1) per cell instead of re-summing every
        # preceding column/row for each cell (previously O(n^2) overall).
        x_offsets = [0]
        for c in range(columns):
            x_offsets.append(x_offsets[-1] + get_col_width(c))
        y_offsets = [0]
        for r in range(rows):
            y_offsets.append(y_offsets[-1] + get_row_height(r))

        # Cells covered by a merge (other than the merge's anchor cell)
        # must not be emitted at all.
        merged_spanned_cells = set()
        for cell_key, merge_info in merged_cells.items():
            base_row, base_col = map(int, cell_key.split('-'))
            rowspan = merge_info.get('rowspan', 1)
            colspan = merge_info.get('colspan', 1)
            for r in range(base_row, base_row + rowspan):
                for c in range(base_col, base_col + colspan):
                    if (r, c) != (base_row, base_col):
                        merged_spanned_cells.add((r, c))

        cell_coords = {}
        for row in range(rows):
            for col in range(columns):
                cell_key = f"{row}-{col}"

                # Skip explicitly hidden cells and merge-covered cells.
                if hidden_cells.get(cell_key) or (row, col) in merged_spanned_cells:
                    continue

                if cell_key in merged_cells:
                    merge_info = merged_cells[cell_key]
                    colspan = merge_info.get("colspan", 1)
                    rowspan = merge_info.get("rowspan", 1)
                else:
                    colspan = 1
                    rowspan = 1

                # Span sums use the per-index lookups so a span that runs past
                # the table edge still counts default-sized cells (matches the
                # original summation semantics).
                width = sum(get_col_width(c) for c in range(col, col + colspan))
                height = sum(get_row_height(r) for r in range(row, row + rowspan))

                cell_coords[(row, col)] = {
                    "x": x_offsets[col] + table_x,
                    "y": y_offsets[row] + table_y,
                    "width": width,
                    "height": height,
                    "colspan": colspan,
                    "rowspan": rowspan,
                }

        return cell_coords

    def determine_cell_borders(self, cell_data: Optional[Dict], table_properties: Dict) -> Tuple[int, int, int, int]:
        """
        Determine border visibility for each side of a cell.

        Args:
            cell_data: Individual cell data from JSON (may be None)
            table_properties: Global table properties

        Returns:
            Tuple of (top, bottom, left, right) border flags (0 or 1)
        """
        # Global default: cellBorders.all turns every side on.
        cell_borders = table_properties.get("cellBorders", {})
        has_global_borders = cell_borders.get("all", False)

        borders = {
            "top": 1 if has_global_borders else 0,
            "bottom": 1 if has_global_borders else 0,
            "left": 1 if has_global_borders else 0,
            "right": 1 if has_global_borders else 0,
        }

        # Per-cell style overrides the global default, side by side.
        if cell_data and "cellStyle" in cell_data:
            cell_style = cell_data["cellStyle"]

            border_mappings = {
                "borderTopWidth": "top",
                "borderBottomWidth": "bottom",
                "borderLeftWidth": "left",
                "borderRightWidth": "right",
            }

            has_custom_borders = any(key in cell_style for key in border_mappings)

            if has_custom_borders:
                for width_key, border_side in border_mappings.items():
                    if width_key in cell_style:
                        width = cell_style[width_key]
                        has_border = width > 0

                        # A matching "...Style": "none" hides the border even
                        # when a positive width is present.
                        style_key = width_key.replace("Width", "Style")
                        if style_key in cell_style:
                            if cell_style[style_key] == "none":
                                has_border = False

                        borders[border_side] = 1 if has_border else 0

        return borders["top"], borders["bottom"], borders["left"], borders["right"]

    def convert_table_to_xml(self, table_data: Dict, output_filename: str) -> Tuple[ET.Element, Dict]:
        """
        Convert a single table to XML format with crop information.

        Args:
            table_data: Single table data from JSON
            output_filename: Filename to reference in the XML "document" element

        Returns:
            Tuple of (XML root element, crop info dictionary)
        """
        properties = table_data.get("properties", {})
        table_x = table_data.get("x", 0)
        table_y = table_data.get("y", 0)
        table_width = table_data.get("width", properties.get("width", 0))
        table_height = table_data.get("height", properties.get("height", 0))

        # Padding is a fraction of the table's smaller dimension.
        min_dimension = min(table_width, table_height)
        padding = int(min_dimension * self.padding_ratio)

        # BUGFIX: clamp the crop origin at 0 so the coordinates written to the
        # XML agree with the pixels crop_image() actually produces when the
        # table sits within `padding` of the image's top/left edge
        # (crop_image clamps the crop box the same way). The crop size shrinks
        # by the clamped amount so the cropped pixel region is unchanged.
        raw_crop_x = table_x - padding
        raw_crop_y = table_y - padding
        crop_x = max(0, raw_crop_x)
        crop_y = max(0, raw_crop_y)
        crop_width = table_width + (2 * padding) - (crop_x - raw_crop_x)
        crop_height = table_height + (2 * padding) - (crop_y - raw_crop_y)

        root = ET.Element("document", filename=output_filename)
        table_elem = ET.SubElement(root, "table")

        # Table outline in crop-local coordinates, clockwise from top-left.
        table_x_in_crop = int(table_x - crop_x)
        table_y_in_crop = int(table_y - crop_y)
        table_coords = f"{table_x_in_crop},{table_y_in_crop} {table_x_in_crop + table_width},{table_y_in_crop} {table_x_in_crop + table_width},{table_y_in_crop + table_height} {table_x_in_crop},{table_y_in_crop + table_height}"
        ET.SubElement(table_elem, "Coords", points=table_coords)

        cell_coords = self.calculate_cell_coordinates(properties, table_x, table_y)
        cell_data = properties.get("cellData", {})

        for (row, col), coords in cell_coords.items():
            cell_key = f"{row}-{col}"
            current_cell_data = cell_data.get(cell_key, {})

            # Inclusive end indices of the (possibly merged) cell.
            end_row = row + coords["rowspan"] - 1
            end_col = col + coords["colspan"] - 1

            cell_elem = ET.SubElement(table_elem, "cell")
            cell_elem.set("start-row", str(row))
            cell_elem.set("end-row", str(end_row))
            cell_elem.set("start-col", str(col))
            cell_elem.set("end-col", str(end_col))

            # Absolute pixel box in the original image...
            original_x1 = int(coords["x"])
            original_y1 = int(coords["y"])
            original_x2 = int(coords["x"] + coords["width"])
            original_y2 = int(coords["y"] + coords["height"])

            # ...translated into the cropped image's coordinate system.
            crop_x1 = original_x1 - int(crop_x)
            crop_y1 = original_y1 - int(crop_y)
            crop_x2 = original_x2 - int(crop_x)
            crop_y2 = original_y2 - int(crop_y)

            cell_coords_str = f"{crop_x1},{crop_y1} {crop_x2},{crop_y1} {crop_x2},{crop_y2} {crop_x1},{crop_y2}"
            ET.SubElement(cell_elem, "Coords", points=cell_coords_str)

            top, bottom, left, right = self.determine_cell_borders(current_cell_data, properties)
            ET.SubElement(cell_elem, "Lines",
                          top=str(top),
                          bottom=str(bottom),
                          left=str(left),
                          right=str(right))

        crop_info = {
            "crop_x": crop_x,
            "crop_y": crop_y,
            "crop_width": crop_width,
            "crop_height": crop_height,
            "padding": padding,
            "table_id": table_data.get("id", "unknown"),
        }

        return root, crop_info

    def save_xml(self, xml_root: ET.Element, output_path: str) -> bool:
        """
        Save XML to file with pretty formatting.

        Args:
            xml_root: XML root element
            output_path: Path to save XML file

        Returns:
            True if successful, False otherwise
        """
        try:
            # Round-trip through minidom for indentation, then drop the
            # blank lines toprettyxml inserts between elements.
            rough_string = ET.tostring(xml_root, encoding='unicode')
            reparsed = minidom.parseString(rough_string)
            pretty_xml = reparsed.toprettyxml(indent="  ")

            lines = [line for line in pretty_xml.split('\n') if line.strip()]
            pretty_xml = '\n'.join(lines)

            with open(output_path, 'w', encoding='utf-8') as f:
                f.write(pretty_xml)

            return True
        except Exception as e:
            print(f"❌ Error saving XML to {output_path}: {e}")
            return False

    def crop_image(self, image_path: str, crop_info: Dict, output_path: str) -> bool:
        """
        Crop image based on crop information.

        Args:
            image_path: Path to original image
            crop_info: Crop information dictionary (from convert_table_to_xml)
            output_path: Path to save cropped image

        Returns:
            True if successful, False otherwise
        """
        try:
            # Imported lazily so the rest of the pipeline works without Pillow.
            from PIL import Image

            with Image.open(image_path) as img:
                # Clamp the crop box to the image bounds.
                left = max(0, int(crop_info['crop_x']))
                top = max(0, int(crop_info['crop_y']))
                right = min(img.width, int(crop_info['crop_x'] + crop_info['crop_width']))
                bottom = min(img.height, int(crop_info['crop_y'] + crop_info['crop_height']))

                # Guard against a crop region entirely outside the image,
                # which would otherwise yield an empty/degenerate crop.
                if right <= left or bottom <= top:
                    print(f"❌ Invalid crop region for {image_path}: {(left, top, right, bottom)}")
                    return False

                cropped_img = img.crop((left, top, right, bottom))
                cropped_img.save(output_path)

            return True

        except ImportError:
            print("❌ PIL/Pillow not installed. Run: pip install Pillow")
            return False
        except Exception as e:
            print(f"❌ Error cropping image: {e}")
            return False

    def generate_output_filenames(self, base_name: str, table_index: int, table_id: str, total_tables: int, output_dir: str) -> Tuple[str, str, str]:
        """
        Generate appropriate output filenames for XML and image files.

        Args:
            base_name: Base filename without extension
            table_index: Index of current table
            table_id: ID of the table from JSON
            total_tables: Total number of tables in the file
            output_dir: Output directory

        Returns:
            Tuple of (xml_path, image_path, image_filename_for_xml)
        """
        if total_tables > 1:
            # Coerce to str first (JSON ids may be numeric), and strip path
            # separators so an id cannot escape into subdirectories.
            clean_table_id = str(table_id).replace('/', '_').replace('\\', '_')
            xml_filename = f"{base_name}_table_{table_index}_{clean_table_id}.xml"
            image_filename = f"{base_name}_table_{table_index}_{clean_table_id}.png"
        else:
            xml_filename = f"{base_name}.xml"
            image_filename = f"{base_name}_cropped.png"

        xml_path = os.path.join(output_dir, xml_filename)
        image_path = os.path.join(output_dir, image_filename)

        return xml_path, image_path, image_filename

    def process_single_file(self, json_path: str, image_path: str, output_dir: str = "output") -> int:
        """
        Process a single JSON+PNG file pair to extract all tables.

        Args:
            json_path: Path to JSON file
            image_path: Path to PNG image file
            output_dir: Directory for output files

        Returns:
            Number of tables successfully processed
        """
        try:
            os.makedirs(output_dir, exist_ok=True)

            with open(json_path, 'r', encoding='utf-8') as f:
                raw = json.load(f)

            # Accept either a top-level {"items": [...]} wrapper or a bare
            # list / table object. The previous unconditional
            # raw.get('items') crashed on list input and silently dropped a
            # bare table dict.
            if isinstance(raw, dict) and "items" in raw:
                json_data = raw["items"]
            else:
                json_data = raw

            tables = self.extract_tables_from_json(json_data)

            if not tables:
                print(f"❌ No tables found in {json_path}")
                return 0

            print(f"📊 Found {len(tables)} table(s) in {json_path}")

            base_name = os.path.splitext(os.path.basename(json_path))[0]
            successful_count = 0

            # Each table gets its own XML + cropped image; one table failing
            # must not abort the rest.
            for table_index, table_data in enumerate(tables):
                try:
                    table_id = table_data.get('id', f'table_{table_index}')
                    print(f"  🔄 Processing table {table_index + 1}/{len(tables)} (id: {table_id})")

                    xml_path, image_output_path, image_filename = self.generate_output_filenames(
                        base_name, table_index, table_id, len(tables), output_dir
                    )

                    xml_root, crop_info = self.convert_table_to_xml(table_data, image_filename)

                    if not self.save_xml(xml_root, xml_path):
                        continue

                    if not self.crop_image(image_path, crop_info, image_output_path):
                        continue

                    print(f"  ✅ Table {table_index + 1} completed:")
                    print(f"    📄 XML: {xml_path}")
                    print(f"    🖼️ Image: {image_output_path}")
                    print(f"    📐 Padding: {crop_info['padding']}px ({self.padding_ratio:.1%})")

                    successful_count += 1

                except Exception as e:
                    print(f"  ❌ Error processing table {table_index + 1}: {e}")
                    continue

            print(f"✅ Successfully processed {successful_count}/{len(tables)} tables from {json_path}")
            return successful_count

        except Exception as e:
            print(f"❌ Error processing file {json_path}: {e}")
            return 0

    def process_batch(self, input_dir: str, output_dir: str = "output") -> int:
        """
        Batch process all JSON+PNG pairs in a directory.

        Args:
            input_dir: Directory containing JSON and PNG files
            output_dir: Directory for output files

        Returns:
            Total number of tables processed across all files
        """
        try:
            json_files = [f for f in os.listdir(input_dir) if f.endswith('.json')]

            if not json_files:
                print(f"❌ No JSON files found in {input_dir}")
                return 0

            print(f"🗂️ Found {len(json_files)} JSON files to process")

            total_tables = 0
            files_processed = 0

            for json_file in json_files:
                # A JSON file is only processed when a same-named PNG exists.
                base_name = os.path.splitext(json_file)[0]
                png_file = f"{base_name}.png"

                json_path = os.path.join(input_dir, json_file)
                png_path = os.path.join(input_dir, png_file)

                if os.path.exists(png_path):
                    print(f"\n🔄 Processing file pair: {base_name}")
                    tables_count = self.process_single_file(json_path, png_path, output_dir)
                    if tables_count > 0:
                        total_tables += tables_count
                        files_processed += 1
                else:
                    print(f"⚠️ Warning: No corresponding PNG file found for {json_file}")

            print(f"\n🎉 Batch processing completed!")
            print(f"   📄 Files processed: {files_processed}/{len(json_files)}")
            print(f"   📊 Total tables processed: {total_tables}")

            return total_tables

        except Exception as e:
            print(f"❌ Error in batch processing: {e}")
            return 0
|
|
|
|
def main():
    """Print usage examples, then batch-process an input directory.

    The input directory may be supplied as the first command-line argument;
    it falls back to the original hard-coded developer path so existing
    invocations keep working.
    """
    import sys

    processor = TableProcessor(padding_ratio=0.02)

    print("🔧 JSON Table to XML Converter")
    print("=" * 50)

    print("\n📋 Usage Examples:")
    print("1. Single file (all tables):")
    print("   processor.process_single_file('page1.json', 'page1.png', 'output')")

    print("\n2. Batch processing (all files, all tables):")
    print("   processor.process_batch('input_folder', 'output_folder')")

    print("\n3. Custom padding:")
    print("   processor = TableProcessor(padding_ratio=0.08)  # 8% padding")

    # Machine-specific default retained for backward compatibility; prefer
    # passing the directory explicitly: python script.py <input_dir>
    default_input = '/Users/tuvn18/Desktop/tuvn18/dev/KIAI/dev/trace/40_page_70_110925'
    input_dir = sys.argv[1] if len(sys.argv) > 1 else default_input
    processor.process_batch(input_dir, 'output_folder')


if __name__ == "__main__":
    main()