"""
JSON Table to XML Converter
Processes JSON files containing table data and corresponding PNG images
to create cropped sub-table images and XML coordinate files for ALL tables found.
"""

import argparse
import json
import os
import xml.etree.ElementTree as ET
from itertools import accumulate
from typing import Dict, List, Tuple, Any, Optional
from xml.dom import minidom


class TableProcessor:
    """Convert table items found in JSON files into XML plus cropped images.

    For every table the processor derives a padded crop rectangle, builds an
    XML document describing the table and its visible cells in crop-relative
    pixel coordinates, and crops the matching region out of the page image.

    NOTE(review): several status messages start with glyphs such as "β" or
    "π" that look like mis-decoded emoji. They are kept as found, except for
    the two success messages whose literal had been split across lines by the
    damage; confirm the intended symbols against the original file.
    """

    def __init__(self, padding_ratio: float = 0.05):
        """
        Initialize the table processor

        Args:
            padding_ratio: Padding around table as ratio of min(width, height)
        """
        self.padding_ratio = padding_ratio
        # Fallback cell sizes used when the JSON gives no explicit
        # width/height for a column/row.
        self.DEFAULT_WIDTH = 100
        self.DEFAULT_HEIGHT = 30

    @staticmethod
    def _rect_points(x1: Any, y1: Any, x2: Any, y2: Any) -> str:
        """Format a rectangle as the four-corner ``points`` string used in the XML."""
        return f"{x1},{y1} {x2},{y1} {x2},{y2} {x1},{y2}"

    def extract_tables_from_json(self, json_data: Any) -> List[Dict]:
        """
        Extract all table items from JSON data

        Args:
            json_data: Parsed JSON data (dict or list)

        Returns:
            List of table dictionaries (empty when nothing matches)
        """
        if isinstance(json_data, list):
            # Lists may hold non-dict entries (strings, nulls); skip them
            # instead of failing on ``.get``.
            return [
                item for item in json_data
                if isinstance(item, dict) and item.get("type") == "table"
            ]
        if isinstance(json_data, dict) and json_data.get("type") == "table":
            return [json_data]
        return []

    def calculate_cell_coordinates(self, table_properties: Dict, table_x: float, table_y: float) -> Dict[Tuple[int, int], Dict]:
        """
        Calculate coordinates for all visible cells in the table

        Hidden cells and cells covered by a merge (other than the merge's
        top-left anchor) are omitted from the result.

        Args:
            table_properties: Table properties from JSON
            table_x: Table X position in original image
            table_y: Table Y position in original image

        Returns:
            Dictionary mapping (row, col) to a dict with the keys
            "x", "y", "width", "height", "colspan" and "rowspan"
        """
        rows = table_properties.get("rows", 0)
        columns = table_properties.get("columns", 0)
        # ``or {}`` also covers an explicit null for any of these maps.
        column_widths = table_properties.get("columnWidths") or {}
        row_heights = table_properties.get("rowHeights") or {}
        merged_cells = table_properties.get("mergedCells") or {}
        hidden_cells = table_properties.get("hiddenCells") or {}

        def get_col_width(col: int) -> float:
            # Size maps are keyed by the stringified index.
            return column_widths.get(str(col), self.DEFAULT_WIDTH)

        def get_row_height(row: int) -> float:
            return row_heights.get(str(row), self.DEFAULT_HEIGHT)

        # Collect every cell swallowed by a merge, excluding the anchor cell.
        merged_spanned_cells = set()
        for cell_key, merge_info in merged_cells.items():
            base_row, base_col = map(int, cell_key.split("-"))
            rowspan = merge_info.get("rowspan", 1)
            colspan = merge_info.get("colspan", 1)
            for r in range(base_row, base_row + rowspan):
                for c in range(base_col, base_col + colspan):
                    if (r, c) != (base_row, base_col):
                        merged_spanned_cells.add((r, c))

        # Prefix sums give each column/row offset in O(1) instead of
        # re-summing all preceding sizes for every cell.
        col_offsets = [0, *accumulate(get_col_width(c) for c in range(columns))]
        row_offsets = [0, *accumulate(get_row_height(r) for r in range(rows))]

        cell_coords = {}
        for row in range(rows):
            for col in range(columns):
                cell_key = f"{row}-{col}"
                if hidden_cells.get(cell_key) or (row, col) in merged_spanned_cells:
                    continue

                merge_info = merged_cells.get(cell_key, {})
                colspan = merge_info.get("colspan", 1)
                rowspan = merge_info.get("rowspan", 1)

                # A span may reach past the declared grid, so sum through the
                # getters (which fall back to defaults) rather than slicing
                # the prefix arrays.
                width = sum(get_col_width(c) for c in range(col, col + colspan))
                height = sum(get_row_height(r) for r in range(row, row + rowspan))

                cell_coords[(row, col)] = {
                    "x": col_offsets[col] + table_x,
                    "y": row_offsets[row] + table_y,
                    "width": width,
                    "height": height,
                    "colspan": colspan,
                    "rowspan": rowspan,
                }

        return cell_coords

    def determine_cell_borders(self, cell_data: Optional[Dict], table_properties: Dict) -> Tuple[int, int, int, int]:
        """
        Determine border visibility for each side of a cell

        The table-wide ``cellBorders["all"]`` flag provides the default for
        every side; a ``border<Side>Width`` entry in the cell's ``cellStyle``
        overrides that side, and a matching ``border<Side>Style`` of "none"
        forces the side off.

        Args:
            cell_data: Individual cell data from JSON
            table_properties: Global table properties

        Returns:
            Tuple of (top, bottom, left, right) border flags (0 or 1)
        """
        cell_borders = table_properties.get("cellBorders") or {}
        default_flag = 1 if cell_borders.get("all", False) else 0
        borders = {side: default_flag for side in ("top", "bottom", "left", "right")}

        cell_style = cell_data.get("cellStyle") if cell_data else None
        if cell_style:
            border_mappings = {
                "borderTopWidth": "top",
                "borderBottomWidth": "bottom",
                "borderLeftWidth": "left",
                "borderRightWidth": "right",
            }
            for width_key, border_side in border_mappings.items():
                if width_key not in cell_style:
                    # No per-cell override for this side; keep the default.
                    continue
                has_border = cell_style[width_key] > 0
                style_key = width_key.replace("Width", "Style")
                if cell_style.get(style_key) == "none":
                    has_border = False
                borders[border_side] = 1 if has_border else 0

        return borders["top"], borders["bottom"], borders["left"], borders["right"]

    def convert_table_to_xml(self, table_data: Dict, output_filename: str) -> Tuple[ET.Element, Dict]:
        """
        Convert a single table to XML format with crop information

        All coordinates written to the XML are relative to the crop rectangle
        returned in the crop info. The crop origin is clamped to the image's
        top-left corner so that it matches what ``crop_image`` actually cuts
        out; otherwise a table closer to the edge than the padding would get
        coordinates shifted against the saved image.

        Args:
            table_data: Single table data from JSON
            output_filename: Filename to reference in XML

        Returns:
            Tuple of (XML root element, crop info dictionary). The crop info
            has the keys "crop_x", "crop_y", "crop_width", "crop_height",
            "padding" and "table_id".
        """
        properties = table_data.get("properties") or {}
        table_x = table_data.get("x", 0)
        table_y = table_data.get("y", 0)
        table_width = table_data.get("width", properties.get("width", 0))
        table_height = table_data.get("height", properties.get("height", 0))

        padding = int(min(table_width, table_height) * self.padding_ratio)

        # Clamp the origin to the image and shrink the crop size by the
        # clipped amount so the right/bottom edges stay where they were.
        crop_x = max(0, table_x - padding)
        crop_y = max(0, table_y - padding)
        crop_width = table_width + (2 * padding) - (crop_x - (table_x - padding))
        crop_height = table_height + (2 * padding) - (crop_y - (table_y - padding))

        # Integer origin, identical to the one crop_image derives.
        origin_x = int(crop_x)
        origin_y = int(crop_y)

        root = ET.Element("document", filename=output_filename)
        table_elem = ET.SubElement(root, "table")

        # Table outline, computed the same way as the cell outlines below so
        # both use integer, crop-relative pixels.
        table_coords = self._rect_points(
            int(table_x) - origin_x,
            int(table_y) - origin_y,
            int(table_x + table_width) - origin_x,
            int(table_y + table_height) - origin_y,
        )
        ET.SubElement(table_elem, "Coords", points=table_coords)

        cell_coords = self.calculate_cell_coordinates(properties, table_x, table_y)
        cell_data = properties.get("cellData") or {}

        for (row, col), coords in cell_coords.items():
            current_cell_data = cell_data.get(f"{row}-{col}", {})

            cell_elem = ET.SubElement(table_elem, "cell")
            cell_elem.set("start-row", str(row))
            cell_elem.set("end-row", str(row + coords["rowspan"] - 1))
            cell_elem.set("start-col", str(col))
            cell_elem.set("end-col", str(col + coords["colspan"] - 1))

            # Truncate in image space first, then shift into crop space.
            cell_coords_str = self._rect_points(
                int(coords["x"]) - origin_x,
                int(coords["y"]) - origin_y,
                int(coords["x"] + coords["width"]) - origin_x,
                int(coords["y"] + coords["height"]) - origin_y,
            )
            ET.SubElement(cell_elem, "Coords", points=cell_coords_str)

            top, bottom, left, right = self.determine_cell_borders(current_cell_data, properties)
            ET.SubElement(
                cell_elem,
                "Lines",
                top=str(top),
                bottom=str(bottom),
                left=str(left),
                right=str(right),
            )

        crop_info = {
            "crop_x": crop_x,
            "crop_y": crop_y,
            "crop_width": crop_width,
            "crop_height": crop_height,
            "padding": padding,
            "table_id": table_data.get("id", "unknown"),
        }

        return root, crop_info

    def save_xml(self, xml_root: ET.Element, output_path: str) -> bool:
        """
        Save XML to file with pretty formatting

        Args:
            xml_root: XML root element
            output_path: Path to save XML file

        Returns:
            True if successful, False otherwise (the error is printed)
        """
        try:
            rough_string = ET.tostring(xml_root, encoding="unicode")
            pretty_xml = minidom.parseString(rough_string).toprettyxml(indent=" ")

            # Drop whitespace-only lines from the pretty-printed output.
            pretty_xml = "\n".join(
                line for line in pretty_xml.split("\n") if line.strip()
            )

            with open(output_path, "w", encoding="utf-8") as f:
                f.write(pretty_xml)

            return True
        except Exception as e:
            print(f"β Error saving XML to {output_path}: {e}")
            return False

    def crop_image(self, image_path: str, crop_info: Dict, output_path: str) -> bool:
        """
        Crop image based on crop information

        The crop rectangle is clamped to the image bounds before cropping.

        Args:
            image_path: Path to original image
            crop_info: Crop information dictionary
            output_path: Path to save cropped image

        Returns:
            True if successful, False otherwise (the error is printed)
        """
        try:
            # Imported lazily so the rest of the class works without Pillow.
            from PIL import Image

            with Image.open(image_path) as img:
                left = max(0, int(crop_info["crop_x"]))
                top = max(0, int(crop_info["crop_y"]))
                right = min(img.width, int(crop_info["crop_x"] + crop_info["crop_width"]))
                bottom = min(img.height, int(crop_info["crop_y"] + crop_info["crop_height"]))

                cropped_img = img.crop((left, top, right, bottom))
                cropped_img.save(output_path)

            return True

        except ImportError:
            print("β PIL/Pillow not installed. Run: pip install Pillow")
            return False
        except Exception as e:
            print(f"β Error cropping image: {e}")
            return False

    def generate_output_filenames(self, base_name: str, table_index: int, table_id: str, total_tables: int, output_dir: str) -> Tuple[str, str, str]:
        """
        Generate appropriate output filenames for XML and image files

        With several tables per file the names carry the table index and id;
        with a single table they are derived from the base name alone.

        Args:
            base_name: Base filename without extension
            table_index: Index of current table
            table_id: ID of the table from JSON
            total_tables: Total number of tables in the file
            output_dir: Output directory

        Returns:
            Tuple of (xml_path, image_path, image_filename_for_xml)
        """
        if total_tables > 1:
            # ``str()`` guards against non-string ids; path separators are
            # replaced so the id cannot escape the output directory.
            clean_table_id = str(table_id).replace("/", "_").replace("\\", "_")
            stem = f"{base_name}_table_{table_index}_{clean_table_id}"
            xml_filename = f"{stem}.xml"
            image_filename = f"{stem}.png"
        else:
            xml_filename = f"{base_name}.xml"
            image_filename = f"{base_name}_cropped.png"

        xml_path = os.path.join(output_dir, xml_filename)
        image_path = os.path.join(output_dir, image_filename)

        return xml_path, image_path, image_filename

    def process_single_file(self, json_path: str, image_path: str, output_dir: str = "output") -> int:
        """
        Process a single JSON+PNG file pair to extract all tables

        The JSON may be an object with an "items" list, a bare list of items,
        or a single table object.

        Args:
            json_path: Path to JSON file
            image_path: Path to PNG image file
            output_dir: Directory for output files

        Returns:
            Number of tables successfully processed (XML saved and image
            cropped); 0 when the file cannot be read or holds no tables
        """
        try:
            os.makedirs(output_dir, exist_ok=True)

            with open(json_path, "r", encoding="utf-8") as f:
                json_data = json.load(f)

            # Unwrap the "items" container only when it is present, so the
            # list and single-table shapes accepted by
            # extract_tables_from_json keep working.
            if isinstance(json_data, dict) and "items" in json_data:
                json_data = json_data["items"]

            tables = self.extract_tables_from_json(json_data)

            if not tables:
                print(f"β No tables found in {json_path}")
                return 0

            print(f"π Found {len(tables)} table(s) in {json_path}")

            base_name = os.path.splitext(os.path.basename(json_path))[0]
            successful_count = 0

            for table_index, table_data in enumerate(tables):
                # A failure in one table must not abort the remaining ones.
                try:
                    table_id = table_data.get("id", f"table_{table_index}")
                    print(f" π Processing table {table_index + 1}/{len(tables)} (id: {table_id})")

                    xml_path, image_output_path, image_filename = self.generate_output_filenames(
                        base_name, table_index, table_id, len(tables), output_dir
                    )

                    xml_root, crop_info = self.convert_table_to_xml(table_data, image_filename)

                    if not self.save_xml(xml_root, xml_path):
                        continue

                    if not self.crop_image(image_path, crop_info, image_output_path):
                        continue

                    print(f" ✅ Table {table_index + 1} completed:")
                    print(f" π XML: {xml_path}")
                    print(f" πΌοΈ Image: {image_output_path}")
                    print(f" π Padding: {crop_info['padding']}px ({self.padding_ratio:.1%})")

                    successful_count += 1

                except Exception as e:
                    print(f" β Error processing table {table_index + 1}: {e}")

            print(f"✅ Successfully processed {successful_count}/{len(tables)} tables from {json_path}")
            return successful_count

        except Exception as e:
            print(f"β Error processing file {json_path}: {e}")
            return 0

    def process_batch(self, input_dir: str, output_dir: str = "output") -> int:
        """
        Batch process all JSON+PNG pairs in a directory

        A JSON file is paired with the PNG of the same base name; JSON files
        without a matching PNG are reported and skipped.

        Args:
            input_dir: Directory containing JSON and PNG files
            output_dir: Directory for output files

        Returns:
            Total number of tables processed across all files
        """
        try:
            # Sorted so runs are reproducible regardless of directory order.
            json_files = sorted(f for f in os.listdir(input_dir) if f.endswith(".json"))

            if not json_files:
                print(f"β No JSON files found in {input_dir}")
                return 0

            print(f"ποΈ Found {len(json_files)} JSON files to process")

            total_tables = 0
            files_processed = 0

            for json_file in json_files:
                base_name = os.path.splitext(json_file)[0]
                json_path = os.path.join(input_dir, json_file)
                png_path = os.path.join(input_dir, f"{base_name}.png")

                if not os.path.exists(png_path):
                    print(f"β οΈ Warning: No corresponding PNG file found for {json_file}")
                    continue

                print(f"\nπ Processing file pair: {base_name}")
                tables_count = self.process_single_file(json_path, png_path, output_dir)
                if tables_count > 0:
                    # Only files that yielded at least one table count as processed.
                    total_tables += tables_count
                    files_processed += 1

            print("\nπ Batch processing completed!")
            print(f" π Files processed: {files_processed}/{len(json_files)}")
            print(f" π Total tables processed: {total_tables}")

            return total_tables

        except Exception as e:
            print(f"β Error in batch processing: {e}")
            return 0


def main(argv: Optional[List[str]] = None) -> None:
    """Print the usage banner, then batch-convert a directory of JSON+PNG pairs.

    The input directory, output directory and padding ratio can be given on
    the command line; when omitted they fall back to the values that were
    previously hard-coded, so a plain ``main()`` behaves as before.

    Args:
        argv: Command-line arguments without the program name. ``None``
            makes argparse read ``sys.argv[1:]``.
    """
    parser = argparse.ArgumentParser(description="JSON Table to XML Converter")
    parser.add_argument(
        "input_dir",
        nargs="?",
        # Historical default: a developer-local path. Pass a directory
        # explicitly on any other machine.
        default='/Users/tuvn18/Desktop/tuvn18/dev/KIAI/dev/trace/40_page_70_110925',
        help="directory containing the JSON and PNG files",
    )
    parser.add_argument(
        "output_dir",
        nargs="?",
        default='output_folder',
        help="directory for the generated XML and cropped images",
    )
    parser.add_argument(
        "--padding-ratio",
        type=float,
        default=0.02,
        help="padding around each table as a ratio of min(width, height)",
    )
    args = parser.parse_args(argv)

    processor = TableProcessor(padding_ratio=args.padding_ratio)

    print("π§ JSON Table to XML Converter")
    print("=" * 50)

    print("\nπ Usage Examples:")
    print("1. Single file (all tables):")
    print(" processor.process_single_file('page1.json', 'page1.png', 'output')")

    print("\n2. Batch processing (all files, all tables):")
    print(" processor.process_batch('input_folder', 'output_folder')")

    print("\n3. Custom padding:")
    print(" processor = TableProcessor(padding_ratio=0.08) # 8% padding")

    processor.process_batch(args.input_dir, args.output_dir)


# Run the converter only when executed as a script, not on import.
if __name__ == "__main__":
    main()