trace_src / crop_tables.py

Upload 7 files

66180d7 verified 4 months ago

21.1 kB

	#!/usr/bin/env python3
	"""
	JSON Table to XML Converter
	Processes JSON files containing table data and corresponding PNG images
	to create cropped sub-table images and XML coordinate files for ALL tables found.
	"""

	import json
	import xml.etree.ElementTree as ET
	from xml.dom import minidom
	import os
	from typing import Dict, List, Tuple, Any, Optional

	class TableProcessor:
	    """Convert table items from parsed JSON into XML annotations and cropped images.

	    For every table item the processor writes an XML file listing the table
	    outline, each visible cell's rectangle and border flags, and a PNG crop of
	    the source image around the table (table box plus padding).
	    """

	    def __init__(self, padding_ratio: float = 0.05):
	        """
	        Initialize the table processor

	        Args:
	            padding_ratio: Padding around table as ratio of min(width, height)
	        """
	        self.padding_ratio = padding_ratio
	        # Fallback sizes for columns/rows missing from columnWidths/rowHeights
	        self.DEFAULT_WIDTH = 100
	        self.DEFAULT_HEIGHT = 30

	    @staticmethod
	    def _rect_points(x1, y1, x2, y2) -> str:
	        """Format a rectangle as the four-corner ``points`` string used in the XML."""
	        return f"{x1},{y1} {x2},{y1} {x2},{y2} {x1},{y2}"

	    def extract_tables_from_json(self, json_data: Any) -> List[Dict]:
	        """
	        Extract all table items from JSON data

	        Args:
	            json_data: Parsed JSON data (dict or list)

	        Returns:
	            List of table dictionaries (empty when nothing matches)
	        """
	        if isinstance(json_data, list):
	            # Non-dict entries cannot be tables; skip them instead of raising
	            return [
	                item for item in json_data
	                if isinstance(item, dict) and item.get("type") == "table"
	            ]
	        if isinstance(json_data, dict) and json_data.get("type") == "table":
	            # Single table item
	            return [json_data]
	        return []

	    def calculate_cell_coordinates(self, table_properties: Dict, table_x: float, table_y: float) -> Dict[Tuple[int, int], Dict]:
	        """
	        Calculate coordinates for all visible cells in the table

	        Hidden cells and cells covered by a merge (other than the merge origin)
	        are omitted. A merge origin gets the summed size of its spanned
	        columns/rows.

	        Args:
	            table_properties: Table properties from JSON
	            table_x: Table X position in original image
	            table_y: Table Y position in original image

	        Returns:
	            Dictionary mapping (row, col) to a dict with keys
	            x, y, width, height, colspan, rowspan (x/y in original image space)
	        """
	        rows = table_properties.get("rows") or 0
	        columns = table_properties.get("columns") or 0
	        # "or {}" also tolerates an explicit JSON null for these maps
	        column_widths = table_properties.get("columnWidths") or {}
	        row_heights = table_properties.get("rowHeights") or {}
	        merged_cells = table_properties.get("mergedCells") or {}
	        hidden_cells = table_properties.get("hiddenCells") or {}

	        def get_col_width(col: int):
	            return column_widths.get(str(col), self.DEFAULT_WIDTH)

	        def get_row_height(row: int):
	            return row_heights.get(str(row), self.DEFAULT_HEIGHT)

	        # Build set of cells that are covered by merged cells (excluding origin)
	        merged_spanned_cells = set()
	        for cell_key, merge_info in merged_cells.items():
	            base_row, base_col = map(int, cell_key.split('-'))
	            rowspan = merge_info.get('rowspan', 1)
	            colspan = merge_info.get('colspan', 1)
	            for r in range(base_row, base_row + rowspan):
	                for c in range(base_col, base_col + colspan):
	                    if (r, c) != (base_row, base_col):
	                        merged_spanned_cells.add((r, c))

	        # Prefix sums: offset of each column/row from the table origin,
	        # computed once instead of re-summing for every cell
	        col_offsets = [0]
	        for c in range(columns):
	            col_offsets.append(col_offsets[-1] + get_col_width(c))
	        row_offsets = [0]
	        for r in range(rows):
	            row_offsets.append(row_offsets[-1] + get_row_height(r))

	        cell_coords = {}
	        for row in range(rows):
	            for col in range(columns):
	                cell_key = f"{row}-{col}"

	                # Skip hidden cells and cells covered by merges
	                if hidden_cells.get(cell_key) or (row, col) in merged_spanned_cells:
	                    continue

	                # A merge origin spans several columns/rows; others span one
	                merge_info = merged_cells.get(cell_key) or {}
	                colspan = merge_info.get("colspan", 1)
	                rowspan = merge_info.get("rowspan", 1)

	                # Spans may name columns/rows without explicit sizes, so sum
	                # through the helpers (which apply the defaults)
	                width = sum(get_col_width(c) for c in range(col, col + colspan))
	                height = sum(get_row_height(r) for r in range(row, row + rowspan))

	                # Store coordinates in original image space (no scaling applied)
	                cell_coords[(row, col)] = {
	                    "x": col_offsets[col] + table_x,
	                    "y": row_offsets[row] + table_y,
	                    "width": width,
	                    "height": height,
	                    "colspan": colspan,
	                    "rowspan": rowspan,
	                }

	        return cell_coords

	    def determine_cell_borders(self, cell_data: Optional[Dict], table_properties: Dict) -> Tuple[int, int, int, int]:
	        """
	        Determine border visibility for each side of a cell

	        Every side starts from the global ``cellBorders["all"]`` flag. A side is
	        overridden only when the cell's ``cellStyle`` carries that side's
	        border width: width > 0 means visible, unless the matching border
	        style is "none".

	        Args:
	            cell_data: Individual cell data from JSON (may be None or empty)
	            table_properties: Global table properties

	        Returns:
	            Tuple of (top, bottom, left, right) border flags (0 or 1)
	        """
	        cell_borders = table_properties.get("cellBorders") or {}
	        default_flag = 1 if cell_borders.get("all", False) else 0
	        borders = {side: default_flag for side in ("top", "bottom", "left", "right")}

	        cell_style = (cell_data or {}).get("cellStyle") or {}

	        # (width key, style key, side) for each overridable border
	        border_keys = (
	            ("borderTopWidth", "borderTopStyle", "top"),
	            ("borderBottomWidth", "borderBottomStyle", "bottom"),
	            ("borderLeftWidth", "borderLeftStyle", "left"),
	            ("borderRightWidth", "borderRightStyle", "right"),
	        )
	        for width_key, style_key, side in border_keys:
	            if width_key not in cell_style:
	                continue  # no override for this side; keep the global default
	            has_border = cell_style[width_key] > 0
	            if cell_style.get(style_key) == "none":
	                has_border = False
	            borders[side] = 1 if has_border else 0

	        return borders["top"], borders["bottom"], borders["left"], borders["right"]

	    def convert_table_to_xml(self, table_data: Dict, output_filename: str) -> Tuple[ET.Element, Dict]:
	        """
	        Convert a single table to XML format with crop information

	        All coordinates written to the XML are relative to the crop origin.
	        The origin is clamped to the image's top/left edge (0) here, so the
	        annotations stay aligned with what crop_image actually cuts out when a
	        table sits closer to the edge than the padding.

	        Args:
	            table_data: Single table data from JSON
	            output_filename: Filename to reference in XML

	        Returns:
	            Tuple of (XML root element, crop info dictionary). The crop info has
	            keys crop_x, crop_y, crop_width, crop_height, padding, table_id.
	        """
	        # Extract table properties
	        properties = table_data.get("properties", {})
	        table_x = table_data.get("x", 0)
	        table_y = table_data.get("y", 0)
	        table_width = table_data.get("width", properties.get("width", 0))
	        table_height = table_data.get("height", properties.get("height", 0))

	        # Calculate padding based on table dimensions
	        padding = int(min(table_width, table_height) * self.padding_ratio)

	        # Crop origin and the table's offset inside the crop. When the padded
	        # origin would fall left of / above the image, the crop starts at 0 and
	        # the table keeps its original offset instead of `padding`.
	        if table_x - padding >= 0:
	            crop_x, table_x_in_crop = table_x - padding, padding
	        else:
	            crop_x, table_x_in_crop = 0, table_x
	        if table_y - padding >= 0:
	            crop_y, table_y_in_crop = table_y - padding, padding
	        else:
	            crop_y, table_y_in_crop = 0, table_y

	        # Right/bottom edge stays at table edge + padding
	        crop_width = table_x_in_crop + table_width + padding
	        crop_height = table_y_in_crop + table_height + padding

	        # Create XML structure
	        root = ET.Element("document", filename=output_filename)
	        table_elem = ET.SubElement(root, "table")

	        # Add table coordinates relative to cropped image
	        table_points = self._rect_points(
	            table_x_in_crop,
	            table_y_in_crop,
	            table_x_in_crop + table_width,
	            table_y_in_crop + table_height,
	        )
	        ET.SubElement(table_elem, "Coords", points=table_points)

	        # Get cell coordinates and data
	        cell_coords = self.calculate_cell_coordinates(properties, table_x, table_y)
	        cell_data = properties.get("cellData") or {}

	        # crop_image truncates the origin with int(); use the same value here
	        origin_x = int(crop_x)
	        origin_y = int(crop_y)

	        # Create XML elements for each cell
	        for (row, col), coords in cell_coords.items():
	            current_cell_data = cell_data.get(f"{row}-{col}", {})

	            # Create cell element; end indices are inclusive
	            cell_elem = ET.SubElement(table_elem, "cell")
	            cell_elem.set("start-row", str(row))
	            cell_elem.set("end-row", str(row + coords["rowspan"] - 1))
	            cell_elem.set("start-col", str(col))
	            cell_elem.set("end-col", str(col + coords["colspan"] - 1))

	            # Transform from original image space to cropped image space
	            x1 = int(coords["x"]) - origin_x
	            y1 = int(coords["y"]) - origin_y
	            x2 = int(coords["x"] + coords["width"]) - origin_x
	            y2 = int(coords["y"] + coords["height"]) - origin_y
	            ET.SubElement(cell_elem, "Coords", points=self._rect_points(x1, y1, x2, y2))

	            # Add border information
	            top, bottom, left, right = self.determine_cell_borders(current_cell_data, properties)
	            ET.SubElement(cell_elem, "Lines",
	                          top=str(top),
	                          bottom=str(bottom),
	                          left=str(left),
	                          right=str(right))

	        # Prepare crop information
	        crop_info = {
	            "crop_x": crop_x,
	            "crop_y": crop_y,
	            "crop_width": crop_width,
	            "crop_height": crop_height,
	            "padding": padding,
	            "table_id": table_data.get("id", "unknown"),
	        }

	        return root, crop_info

	    def save_xml(self, xml_root: ET.Element, output_path: str) -> bool:
	        """
	        Save XML to file with pretty formatting

	        Args:
	            xml_root: XML root element
	            output_path: Path to save XML file

	        Returns:
	            True if successful, False otherwise (the error is printed)
	        """
	        try:
	            # Round-trip through minidom to get indented output
	            rough_string = ET.tostring(xml_root, encoding='unicode')
	            pretty_xml = minidom.parseString(rough_string).toprettyxml(indent=" ")

	            # Clean up extra whitespace lines
	            pretty_xml = '\n'.join(
	                line for line in pretty_xml.split('\n') if line.strip()
	            )

	            with open(output_path, 'w', encoding='utf-8') as f:
	                f.write(pretty_xml)

	            return True
	        except Exception as e:
	            # Best-effort boundary: report and let the caller skip this table
	            print(f"❌ Error saving XML to {output_path}: {e}")
	            return False

	    def crop_image(self, image_path: str, crop_info: Dict, output_path: str) -> bool:
	        """
	        Crop image based on crop information

	        The crop rectangle is clamped to the image bounds before cropping.

	        Args:
	            image_path: Path to original image
	            crop_info: Crop information dictionary
	                (reads crop_x, crop_y, crop_width, crop_height)
	            output_path: Path to save cropped image

	        Returns:
	            True if successful, False otherwise (the error is printed)
	        """
	        try:
	            # Imported lazily so the rest of the class works without Pillow
	            from PIL import Image

	            with Image.open(image_path) as img:
	                # Ensure crop coordinates are within image bounds
	                left = max(0, int(crop_info['crop_x']))
	                top = max(0, int(crop_info['crop_y']))
	                right = min(img.width, int(crop_info['crop_x'] + crop_info['crop_width']))
	                bottom = min(img.height, int(crop_info['crop_y'] + crop_info['crop_height']))

	                # A table lying outside the image leaves nothing to crop
	                if right <= left or bottom <= top:
	                    print(f"❌ Error cropping image: empty crop area ({left}, {top}, {right}, {bottom}) for {image_path}")
	                    return False

	                # Crop and save
	                img.crop((left, top, right, bottom)).save(output_path)

	            return True

	        except ImportError:
	            print("❌ PIL/Pillow not installed. Run: pip install Pillow")
	            return False
	        except Exception as e:
	            print(f"❌ Error cropping image: {e}")
	            return False

	    def generate_output_filenames(self, base_name: str, table_index: int, table_id: str, total_tables: int, output_dir: str) -> Tuple[str, str, str]:
	        """
	        Generate appropriate output filenames for XML and image files

	        Args:
	            base_name: Base filename without extension
	            table_index: Index of current table
	            table_id: ID of the table from JSON (non-string ids are stringified)
	            total_tables: Total number of tables in the file
	            output_dir: Output directory

	        Returns:
	            Tuple of (xml_path, image_path, image_filename_for_xml)
	        """
	        if total_tables > 1:
	            # Multiple tables: add index and ID to filename.
	            # JSON ids may be numbers, so convert before cleaning path separators.
	            clean_table_id = str(table_id).replace('/', '_').replace('\\', '_')
	            stem = f"{base_name}_table_{table_index}_{clean_table_id}"
	            xml_filename = f"{stem}.xml"
	            image_filename = f"{stem}.png"
	        else:
	            # Single table: use simple filename
	            xml_filename = f"{base_name}.xml"
	            image_filename = f"{base_name}_cropped.png"

	        xml_path = os.path.join(output_dir, xml_filename)
	        image_path = os.path.join(output_dir, image_filename)

	        return xml_path, image_path, image_filename

	    def process_single_file(self, json_path: str, image_path: str, output_dir: str = "output") -> int:
	        """
	        Process a single JSON+PNG file pair to extract all tables

	        The JSON root may be an object holding the elements under an "items"
	        key, a bare list of elements, or a single table object.

	        Args:
	            json_path: Path to JSON file
	            image_path: Path to PNG image file
	            output_dir: Directory for output files

	        Returns:
	            Number of tables successfully processed (0 on any file-level error)
	        """
	        try:
	            # Create output directory
	            os.makedirs(output_dir, exist_ok=True)

	            # Read and parse JSON
	            with open(json_path, 'r', encoding='utf-8') as f:
	                json_data = json.load(f)

	            # Unwrap the "items" container when present; other root shapes go
	            # straight to extract_tables_from_json, which handles list/dict
	            if isinstance(json_data, dict) and "items" in json_data:
	                json_data = json_data["items"]

	            # Extract all tables
	            tables = self.extract_tables_from_json(json_data)

	            if not tables:
	                print(f"❌ No tables found in {json_path}")
	                return 0

	            print(f"📋 Found {len(tables)} table(s) in {json_path}")

	            base_name = os.path.splitext(os.path.basename(json_path))[0]
	            successful_count = 0

	            # Process each table; one failing table must not stop the others
	            for table_index, table_data in enumerate(tables):
	                try:
	                    table_id = table_data.get('id', f'table_{table_index}')
	                    print(f" 🔄 Processing table {table_index + 1}/{len(tables)} (id: {table_id})")

	                    # Generate filenames
	                    xml_path, image_output_path, image_filename = self.generate_output_filenames(
	                        base_name, table_index, table_id, len(tables), output_dir
	                    )

	                    # Convert table to XML
	                    xml_root, crop_info = self.convert_table_to_xml(table_data, image_filename)

	                    # Save XML file
	                    if not self.save_xml(xml_root, xml_path):
	                        continue

	                    # Crop and save image
	                    if not self.crop_image(image_path, crop_info, image_output_path):
	                        continue

	                    print(f" ✅ Table {table_index + 1} completed:")
	                    print(f" 📄 XML: {xml_path}")
	                    print(f" 🖼️ Image: {image_output_path}")
	                    print(f" 📏 Padding: {crop_info['padding']}px ({self.padding_ratio:.1%})")

	                    successful_count += 1

	                except Exception as e:
	                    print(f" ❌ Error processing table {table_index + 1}: {e}")
	                    continue

	            print(f"✅ Successfully processed {successful_count}/{len(tables)} tables from {json_path}")
	            return successful_count

	        except Exception as e:
	            print(f"❌ Error processing file {json_path}: {e}")
	            return 0

	    def process_batch(self, input_dir: str, output_dir: str = "output") -> int:
	        """
	        Batch process all JSON+PNG pairs in a directory

	        A JSON file is paired with the PNG of the same base name in the same
	        directory; JSON files without such a PNG are reported and skipped.

	        Args:
	            input_dir: Directory containing JSON and PNG files
	            output_dir: Directory for output files

	        Returns:
	            Total number of tables processed across all files
	        """
	        try:
	            # Find all JSON files; sorted so runs are deterministic
	            json_files = sorted(f for f in os.listdir(input_dir) if f.endswith('.json'))

	            if not json_files:
	                print(f"❌ No JSON files found in {input_dir}")
	                return 0

	            print(f"🗂️ Found {len(json_files)} JSON files to process")

	            total_tables = 0
	            files_processed = 0

	            for json_file in json_files:
	                # Look for corresponding PNG file
	                base_name = os.path.splitext(json_file)[0]
	                json_path = os.path.join(input_dir, json_file)
	                png_path = os.path.join(input_dir, f"{base_name}.png")

	                if not os.path.exists(png_path):
	                    print(f"⚠️ Warning: No corresponding PNG file found for {json_file}")
	                    continue

	                print(f"\n📋 Processing file pair: {base_name}")
	                tables_count = self.process_single_file(json_path, png_path, output_dir)
	                # A file counts as processed only if it yielded at least one table
	                if tables_count > 0:
	                    total_tables += tables_count
	                    files_processed += 1

	            print("\n🎉 Batch processing completed!")
	            print(f" 📁 Files processed: {files_processed}/{len(json_files)}")
	            print(f" 📊 Total tables processed: {total_tables}")

	            return total_tables

	        except Exception as e:
	            print(f"❌ Error in batch processing: {e}")
	            return 0


	def main():
	    """Print usage examples, then batch-process a directory of JSON+PNG pairs.

	    Optional command-line arguments override the built-in defaults:
	    ``[input_dir] [output_dir] [--padding-ratio R]``. Running without
	    arguments uses the defaults below.
	    """
	    # Local import: only the command-line entry point needs argparse
	    import argparse

	    parser = argparse.ArgumentParser(description="JSON Table to XML Converter")
	    parser.add_argument(
	        "input_dir",
	        nargs="?",
	        # NOTE(review): machine-specific default kept for backward
	        # compatibility; pass input_dir explicitly on other machines
	        default='/Users/tuvn18/Desktop/tuvn18/dev/KIAI/dev/trace/40_page_70_110925',
	        help="directory containing <name>.json / <name>.png pairs",
	    )
	    parser.add_argument(
	        "output_dir",
	        nargs="?",
	        default='output_folder',
	        help="directory for the generated XML files and cropped images",
	    )
	    parser.add_argument(
	        "--padding-ratio",
	        type=float,
	        default=0.02,
	        help="padding around each table as a ratio of min(width, height)",
	    )
	    args = parser.parse_args()

	    # Create processor instance (default ratio 0.02 = 2% padding)
	    processor = TableProcessor(padding_ratio=args.padding_ratio)

	    print("🔧 JSON Table to XML Converter")
	    print("=" * 50)

	    # Example usage
	    print("\n📖 Usage Examples:")
	    print("1. Single file (all tables):")
	    print(" processor.process_single_file('page1.json', 'page1.png', 'output')")

	    print("\n2. Batch processing (all files, all tables):")
	    print(" processor.process_batch('input_folder', 'output_folder')")

	    print("\n3. Custom padding:")
	    print(" processor = TableProcessor(padding_ratio=0.08) # 8% padding")

	    processor.process_batch(args.input_dir, args.output_dir)


	if __name__ == "__main__":
	    main()