#!/usr/bin/env python3
"""
JSON Table to XML Converter

Processes JSON files containing table data and corresponding PNG images
to create cropped sub-table images and XML coordinate files for ALL
tables found.
"""

import json
import os
import sys
import xml.etree.ElementTree as ET
from typing import Any, Dict, List, Optional, Tuple
from xml.dom import minidom

# Fallbacks used by main() when no command-line arguments are supplied.
DEFAULT_INPUT_DIR = '/Users/tuvn18/Desktop/tuvn18/dev/KIAI/dev/trace/40_page_70_110925'
DEFAULT_OUTPUT_DIR = 'output_folder'


class TableProcessor:
    """Main class for processing table data from JSON to XML with image cropping"""

    # Column width / row height used when the JSON gives no explicit value
    # for a column or row index.
    DEFAULT_WIDTH = 100
    DEFAULT_HEIGHT = 30

    # Maps the per-cell style key holding a border width to the side it controls.
    _BORDER_WIDTH_KEYS = {
        "borderTopWidth": "top",
        "borderBottomWidth": "bottom",
        "borderLeftWidth": "left",
        "borderRightWidth": "right",
    }

    def __init__(self, padding_ratio: float = 0.05):
        """
        Initialize the table processor

        Args:
            padding_ratio: Padding around table as ratio of min(width, height)
        """
        self.padding_ratio = padding_ratio

    def extract_tables_from_json(self, json_data: Any) -> List[Dict]:
        """
        Extract all table items from JSON data

        Args:
            json_data: Parsed JSON data (dict or list)

        Returns:
            List of table dictionaries (empty when nothing matches)
        """
        if isinstance(json_data, list):
            # Non-dict entries (strings, numbers, None) cannot be tables;
            # skip them instead of failing on the missing .get().
            return [
                item for item in json_data
                if isinstance(item, dict) and item.get("type") == "table"
            ]
        if isinstance(json_data, dict) and json_data.get("type") == "table":
            # Single table item
            return [json_data]
        return []

    def calculate_cell_coordinates(self, table_properties: Dict, table_x: float,
                                   table_y: float) -> Dict[Tuple[int, int], Dict]:
        """
        Calculate coordinates for all visible cells in the table

        Args:
            table_properties: Table properties from JSON
            table_x: Table X position in original image
            table_y: Table Y position in original image

        Returns:
            Dictionary mapping (row, col) to a dict with the keys
            "x", "y", "width", "height", "colspan" and "rowspan".
            Hidden cells and cells covered by a merge are omitted.
        """
        rows = table_properties.get("rows", 0)
        columns = table_properties.get("columns", 0)
        column_widths = table_properties.get("columnWidths", {})
        row_heights = table_properties.get("rowHeights", {})
        merged_cells = table_properties.get("mergedCells", {})
        hidden_cells = table_properties.get("hiddenCells", {})

        def get_col_width(col: int) -> float:
            return column_widths.get(str(col), self.DEFAULT_WIDTH)

        def get_row_height(row: int) -> float:
            return row_heights.get(str(row), self.DEFAULT_HEIGHT)

        # Build set of cells that are covered by merged cells (excluding origin)
        merged_spanned_cells = set()
        for cell_key, merge_info in merged_cells.items():
            base_row, base_col = map(int, cell_key.split('-'))
            rowspan = merge_info.get('rowspan', 1)
            colspan = merge_info.get('colspan', 1)
            for r in range(base_row, base_row + rowspan):
                for c in range(base_col, base_col + colspan):
                    if (r, c) != (base_row, base_col):
                        merged_spanned_cells.add((r, c))

        # A column's left offset depends only on the column index, so compute
        # it once per column instead of once per cell.
        col_offsets = [
            sum(get_col_width(c) for c in range(col)) for col in range(columns)
        ]

        cell_coords = {}
        for row in range(rows):
            # Likewise the top offset only depends on the row index.
            y = sum(get_row_height(r) for r in range(row))
            for col in range(columns):
                cell_key = f"{row}-{col}"

                # Skip hidden cells and cells covered by merges
                if hidden_cells.get(cell_key) or (row, col) in merged_spanned_cells:
                    continue

                x = col_offsets[col]

                # A merge origin spans several columns/rows; others span one.
                merge_info = merged_cells.get(cell_key, {})
                colspan = merge_info.get("colspan", 1)
                rowspan = merge_info.get("rowspan", 1)

                # Calculate cell dimensions across the whole span
                width = sum(get_col_width(c) for c in range(col, col + colspan))
                height = sum(get_row_height(r) for r in range(row, row + rowspan))

                # Coordinates are in original-image space: the table offset is
                # added, no scaling is applied.
                cell_coords[(row, col)] = {
                    "x": (x + table_x),
                    "y": (y + table_y),
                    "width": width,
                    "height": height,
                    "colspan": colspan,
                    "rowspan": rowspan
                }

        return cell_coords

    @staticmethod
    def _parse_border_width(value: Any) -> Optional[float]:
        """Return a border width as a number, or None when it cannot be parsed."""
        if isinstance(value, (int, float)):
            return value
        if isinstance(value, str):
            text = value.strip().lower()
            # Tolerate CSS-like values such as "2px".
            if text.endswith("px"):
                text = text[:-2]
            try:
                return float(text)
            except ValueError:
                return None
        return None

    def determine_cell_borders(self, cell_data: Optional[Dict],
                               table_properties: Dict) -> Tuple[int, int, int, int]:
        """
        Determine border visibility for each side of a cell

        Args:
            cell_data: Individual cell data from JSON
            table_properties: Global table properties

        Returns:
            Tuple of (top, bottom, left, right) border flags (0 or 1)
        """
        # Global setting provides the default for every side
        cell_borders = table_properties.get("cellBorders", {})
        default_flag = 1 if cell_borders.get("all", False) else 0
        borders = {side: default_flag for side in ("top", "bottom", "left", "right")}

        cell_style = cell_data.get("cellStyle") if cell_data else None
        if isinstance(cell_style, dict):
            # Only sides whose width key is present override the default.
            for width_key, border_side in self._BORDER_WIDTH_KEYS.items():
                if width_key not in cell_style:
                    continue

                width = self._parse_border_width(cell_style[width_key])
                if width is None:
                    # Unparseable width: keep the global default for this side.
                    has_border = bool(borders[border_side])
                else:
                    has_border = width > 0

                # An explicit "none" style hides the border regardless of width.
                style_key = width_key.replace("Width", "Style")
                if cell_style.get(style_key) == "none":
                    has_border = False

                borders[border_side] = 1 if has_border else 0

        return borders["top"], borders["bottom"], borders["left"], borders["right"]

    @staticmethod
    def _rect_points(x1: int, y1: int, x2: int, y2: int) -> str:
        """Format a rectangle as the four-corner "points" string used in the XML."""
        return f"{x1},{y1} {x2},{y1} {x2},{y2} {x1},{y2}"

    def convert_table_to_xml(self, table_data: Dict,
                             output_filename: str) -> Tuple[ET.Element, Dict]:
        """
        Convert a single table to XML format with crop information

        Args:
            table_data: Single table data from JSON
            output_filename: Filename to reference in XML

        Returns:
            Tuple of (XML root element, crop info dictionary). The crop info
            has the keys "crop_x", "crop_y", "crop_width", "crop_height",
            "padding" and "table_id".
        """
        # Extract table properties
        properties = table_data.get("properties", {})
        table_x = table_data.get("x", 0)
        table_y = table_data.get("y", 0)
        table_width = table_data.get("width", properties.get("width", 0))
        table_height = table_data.get("height", properties.get("height", 0))

        # Calculate padding based on table dimensions
        min_dimension = min(table_width, table_height)
        padding = int(min_dimension * self.padding_ratio)

        # Calculate crop area. crop_image() never crops left of / above the
        # image origin, so clamp here as well; otherwise every coordinate in
        # the XML would be shifted relative to the image that is actually
        # saved. Width/height are measured from the clamped origin so the
        # right/bottom edge stays at table edge + padding.
        crop_x = max(0, table_x - padding)
        crop_y = max(0, table_y - padding)
        crop_width = (table_x + table_width + padding) - crop_x
        crop_height = (table_y + table_height + padding) - crop_y

        # Integer pixel origin of the crop, matching what crop_image() uses.
        origin_x = int(crop_x)
        origin_y = int(crop_y)

        # Create XML structure
        root = ET.Element("document", filename=output_filename)
        table_elem = ET.SubElement(root, "table")

        # Table outline, transformed into cropped-image space exactly like
        # the cells below.
        table_coords = self._rect_points(
            int(table_x) - origin_x,
            int(table_y) - origin_y,
            int(table_x + table_width) - origin_x,
            int(table_y + table_height) - origin_y,
        )
        ET.SubElement(table_elem, "Coords", points=table_coords)

        # Get cell coordinates and data
        cell_coords = self.calculate_cell_coordinates(properties, table_x, table_y)
        cell_data = properties.get("cellData", {})

        # Create XML elements for each cell
        for (row, col), coords in cell_coords.items():
            current_cell_data = cell_data.get(f"{row}-{col}", {})

            # Determine cell span (for merged cells)
            end_row = row + coords["rowspan"] - 1
            end_col = col + coords["colspan"] - 1

            cell_elem = ET.SubElement(table_elem, "cell")
            cell_elem.set("start-row", str(row))
            cell_elem.set("end-row", str(end_row))
            cell_elem.set("start-col", str(col))
            cell_elem.set("end-col", str(end_col))

            # Convert coordinates from original-image to cropped-image space
            cell_coords_str = self._rect_points(
                int(coords["x"]) - origin_x,
                int(coords["y"]) - origin_y,
                int(coords["x"] + coords["width"]) - origin_x,
                int(coords["y"] + coords["height"]) - origin_y,
            )
            ET.SubElement(cell_elem, "Coords", points=cell_coords_str)

            # Add border information
            top, bottom, left, right = self.determine_cell_borders(
                current_cell_data, properties
            )
            ET.SubElement(cell_elem, "Lines", top=str(top), bottom=str(bottom),
                          left=str(left), right=str(right))

        # Prepare crop information
        crop_info = {
            "crop_x": crop_x,
            "crop_y": crop_y,
            "crop_width": crop_width,
            "crop_height": crop_height,
            "padding": padding,
            "table_id": table_data.get("id", "unknown")
        }

        return root, crop_info

    def save_xml(self, xml_root: ET.Element, output_path: str) -> bool:
        """
        Save XML to file with pretty formatting

        Args:
            xml_root: XML root element
            output_path: Path to save XML file

        Returns:
            True if successful, False otherwise
        """
        try:
            # Convert to pretty-formatted string
            rough_string = ET.tostring(xml_root, encoding='unicode')
            reparsed = minidom.parseString(rough_string)
            pretty_xml = reparsed.toprettyxml(indent=" ")

            # Clean up extra whitespace lines
            pretty_xml = '\n'.join(
                line for line in pretty_xml.split('\n') if line.strip()
            )

            # Write to file
            with open(output_path, 'w', encoding='utf-8') as f:
                f.write(pretty_xml)

            return True

        except Exception as e:
            # Best-effort boundary: report and let the caller skip this table.
            print(f"āŒ Error saving XML to {output_path}: {e}")
            return False

    def crop_image(self, image_path: str, crop_info: Dict, output_path: str) -> bool:
        """
        Crop image based on crop information

        Args:
            image_path: Path to original image
            crop_info: Crop information dictionary
            output_path: Path to save cropped image

        Returns:
            True if successful, False otherwise
        """
        try:
            # Imported lazily so the rest of the module works without Pillow.
            from PIL import Image

            with Image.open(image_path) as img:
                # Ensure crop coordinates are within image bounds
                left = max(0, int(crop_info['crop_x']))
                top = max(0, int(crop_info['crop_y']))
                right = min(img.width, int(crop_info['crop_x'] + crop_info['crop_width']))
                bottom = min(img.height, int(crop_info['crop_y'] + crop_info['crop_height']))

                # A table lying entirely outside the image leaves nothing to crop.
                if right <= left or bottom <= top:
                    raise ValueError(
                        f"empty crop region ({left}, {top}, {right}, {bottom})"
                    )

                # Crop and save
                cropped_img = img.crop((left, top, right, bottom))
                cropped_img.save(output_path)

            return True

        except ImportError:
            print("āŒ PIL/Pillow not installed. Run: pip install Pillow")
            return False
        except Exception as e:
            print(f"āŒ Error cropping image: {e}")
            return False

    def generate_output_filenames(self, base_name: str, table_index: int, table_id: str,
                                  total_tables: int, output_dir: str) -> Tuple[str, str, str]:
        """
        Generate appropriate output filenames for XML and image files

        Args:
            base_name: Base filename without extension
            table_index: Index of current table
            table_id: ID of the table from JSON (non-string ids are converted)
            total_tables: Total number of tables in the file
            output_dir: Output directory

        Returns:
            Tuple of (xml_path, image_path, image_filename_for_xml)
        """
        if total_tables > 1:
            # Multiple tables: add index and ID to filename. JSON ids may be
            # numbers, so convert before stripping path separators.
            clean_table_id = str(table_id).replace('/', '_').replace('\\', '_')
            stem = f"{base_name}_table_{table_index}_{clean_table_id}"
            xml_filename = f"{stem}.xml"
            image_filename = f"{stem}.png"
        else:
            # Single table: use simple filename
            xml_filename = f"{base_name}.xml"
            image_filename = f"{base_name}_cropped.png"

        xml_path = os.path.join(output_dir, xml_filename)
        image_path = os.path.join(output_dir, image_filename)

        return xml_path, image_path, image_filename

    def process_single_file(self, json_path: str, image_path: str,
                            output_dir: str = "output") -> int:
        """
        Process a single JSON+PNG file pair to extract all tables

        Args:
            json_path: Path to JSON file
            image_path: Path to PNG image file
            output_dir: Directory for output files

        Returns:
            Number of tables successfully processed
        """
        try:
            # Create output directory
            os.makedirs(output_dir, exist_ok=True)

            # Read and parse JSON
            with open(json_path, 'r', encoding='utf-8') as f:
                json_data = json.load(f)

            # Unwrap the {"items": [...]} envelope only when it is present, so
            # a top-level list or a bare table object is still handled by
            # extract_tables_from_json().
            if isinstance(json_data, dict) and 'items' in json_data:
                json_data = json_data['items']

            # Extract all tables
            tables = self.extract_tables_from_json(json_data)

            if not tables:
                print(f"āŒ No tables found in {json_path}")
                return 0

            print(f"šŸ“‹ Found {len(tables)} table(s) in {json_path}")

            base_name = os.path.splitext(os.path.basename(json_path))[0]
            successful_count = 0

            # Process each table; a failure on one table does not stop the rest
            for table_index, table_data in enumerate(tables):
                try:
                    table_id = table_data.get('id', f'table_{table_index}')
                    print(f" šŸ”„ Processing table {table_index + 1}/{len(tables)} (id: {table_id})")

                    # Generate filenames
                    xml_path, image_output_path, image_filename = self.generate_output_filenames(
                        base_name, table_index, table_id, len(tables), output_dir
                    )

                    # Convert table to XML
                    xml_root, crop_info = self.convert_table_to_xml(table_data, image_filename)

                    # Save XML file
                    if not self.save_xml(xml_root, xml_path):
                        continue

                    # Crop and save image
                    if not self.crop_image(image_path, crop_info, image_output_path):
                        continue

                    print(f" āœ… Table {table_index + 1} completed:")
                    print(f" šŸ“„ XML: {xml_path}")
                    print(f" šŸ–¼ļø Image: {image_output_path}")
                    print(f" šŸ“ Padding: {crop_info['padding']}px ({self.padding_ratio:.1%})")

                    successful_count += 1

                except Exception as e:
                    print(f" āŒ Error processing table {table_index + 1}: {e}")
                    continue

            print(f"āœ… Successfully processed {successful_count}/{len(tables)} tables from {json_path}")
            return successful_count

        except Exception as e:
            print(f"āŒ Error processing file {json_path}: {e}")
            return 0

    def process_batch(self, input_dir: str, output_dir: str = "output") -> int:
        """
        Batch process all JSON+PNG pairs in a directory

        Args:
            input_dir: Directory containing JSON and PNG files
            output_dir: Directory for output files

        Returns:
            Total number of tables processed across all files
        """
        try:
            # Find all JSON files; sorted so runs are reproducible
            # (os.listdir returns entries in arbitrary order).
            json_files = sorted(f for f in os.listdir(input_dir) if f.endswith('.json'))

            if not json_files:
                print(f"āŒ No JSON files found in {input_dir}")
                return 0

            print(f"šŸ—‚ļø Found {len(json_files)} JSON files to process")

            total_tables = 0
            files_processed = 0

            for json_file in json_files:
                # Look for corresponding PNG file
                base_name = os.path.splitext(json_file)[0]
                json_path = os.path.join(input_dir, json_file)
                png_path = os.path.join(input_dir, f"{base_name}.png")

                if not os.path.exists(png_path):
                    print(f"āš ļø Warning: No corresponding PNG file found for {json_file}")
                    continue

                print(f"\nšŸ“‹ Processing file pair: {base_name}")

                tables_count = self.process_single_file(json_path, png_path, output_dir)
                # A file only counts as processed when it yielded a table.
                if tables_count > 0:
                    total_tables += tables_count
                    files_processed += 1

            print(f"\nšŸŽ‰ Batch processing completed!")
            print(f" šŸ“ Files processed: {files_processed}/{len(json_files)}")
            print(f" šŸ“Š Total tables processed: {total_tables}")

            return total_tables

        except Exception as e:
            print(f"āŒ Error in batch processing: {e}")
            return 0


def main():
    """
    Print usage examples, then batch-process a directory.

    The input and output directories are taken from the first and second
    command-line arguments when given; otherwise DEFAULT_INPUT_DIR and
    DEFAULT_OUTPUT_DIR are used.
    """
    # Create processor instance
    processor = TableProcessor(padding_ratio=0.02)  # 2% padding

    print("šŸ”§ JSON Table to XML Converter")
    print("=" * 50)

    # Example usage
    print("\nšŸ“– Usage Examples:")
    print("1. Single file (all tables):")
    print(" processor.process_single_file('page1.json', 'page1.png', 'output')")

    print("\n2. Batch processing (all files, all tables):")
    print(" processor.process_batch('input_folder', 'output_folder')")

    print("\n3. Custom padding:")
    print(" processor = TableProcessor(padding_ratio=0.08) # 8% padding")

    input_dir = sys.argv[1] if len(sys.argv) > 1 else DEFAULT_INPUT_DIR
    output_dir = sys.argv[2] if len(sys.argv) > 2 else DEFAULT_OUTPUT_DIR
    processor.process_batch(input_dir, output_dir)
    # processor.process_single_file('/Users/tuvn18/Desktop/tuvn18/dev/KIAI/dev/trace/page_39/39(draft 13).json', '/Users/tuvn18/Desktop/tuvn18/dev/KIAI/dev/trace/page_39/39(draft 13).png', 'output')


if __name__ == "__main__":
    main()