Upload 7 files
Browse files- crop_tables.py +517 -0
- delete_name.py +12 -0
- json2txt.py +31 -0
- json2xml.py +243 -0
- rename.py +10 -0
- resize.py +26 -0
- vis_json_cell.py +103 -0
crop_tables.py
ADDED
|
@@ -0,0 +1,517 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
JSON Table to XML Converter
|
| 4 |
+
Processes JSON files containing table data and corresponding PNG images
|
| 5 |
+
to create cropped sub-table images and XML coordinate files for ALL tables found.
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
+
import json
|
| 9 |
+
import xml.etree.ElementTree as ET
|
| 10 |
+
from xml.dom import minidom
|
| 11 |
+
import os
|
| 12 |
+
from typing import Dict, List, Tuple, Any, Optional
|
| 13 |
+
|
| 14 |
+
class TableProcessor:
    """Turn table items from a JSON page description into XML annotations and cropped images.

    For every table found, the processor writes one XML file describing the
    table and its visible cells (coordinates relative to the cropped image)
    and one PNG crop of the table region taken from the page image.
    """

    def __init__(self, padding_ratio: float = 0.05):
        """
        Initialize the table processor

        Args:
            padding_ratio: Padding around table as ratio of min(width, height)
        """
        self.padding_ratio = padding_ratio
        # Fallback cell sizes used when a column/row has no explicit entry.
        self.DEFAULT_WIDTH = 100
        self.DEFAULT_HEIGHT = 30

    def extract_tables_from_json(self, json_data: Any) -> List[Dict]:
        """
        Extract all table items from JSON data

        Args:
            json_data: Parsed JSON data. Accepted shapes are a list of items,
                a single item dict with type == "table", or a wrapper dict
                holding the item list under the "items" key.

        Returns:
            List of table dictionaries (empty when nothing matches)
        """
        if isinstance(json_data, dict):
            if json_data.get("type") == "table":
                # Single table item
                return [json_data]
            # Otherwise treat the dict as a page wrapper and look at "items".
            json_data = json_data.get("items")

        if isinstance(json_data, list):
            # Non-dict entries are ignored instead of raising AttributeError.
            return [
                item for item in json_data
                if isinstance(item, dict) and item.get("type") == "table"
            ]

        return []

    def calculate_cell_coordinates(self, table_properties: Dict, table_x: float, table_y: float) -> Dict[Tuple[int, int], Dict]:
        """
        Calculate coordinates for all visible cells in the table

        Args:
            table_properties: Table properties from JSON
            table_x: Table X position in original image
            table_y: Table Y position in original image

        Returns:
            Dictionary mapping (row, col) to a dict with the keys
            "x", "y", "width", "height", "colspan" and "rowspan".
            Hidden cells and cells covered by a merge are omitted.
        """
        rows = table_properties.get("rows", 0)
        columns = table_properties.get("columns", 0)
        column_widths = table_properties.get("columnWidths", {})
        row_heights = table_properties.get("rowHeights", {})
        merged_cells = table_properties.get("mergedCells", {})
        hidden_cells = table_properties.get("hiddenCells", {})

        def get_col_width(col: int) -> int:
            return column_widths.get(str(col), self.DEFAULT_WIDTH)

        def get_row_height(row: int) -> int:
            return row_heights.get(str(row), self.DEFAULT_HEIGHT)

        # Build set of cells that are covered by merged cells (excluding origin)
        merged_spanned_cells = set()
        for cell_key, merge_info in merged_cells.items():
            base_row, base_col = map(int, cell_key.split('-'))
            rowspan = merge_info.get('rowspan', 1)
            colspan = merge_info.get('colspan', 1)

            for r in range(base_row, base_row + rowspan):
                for c in range(base_col, base_col + colspan):
                    if (r, c) != (base_row, base_col):
                        merged_spanned_cells.add((r, c))

        # Offsets depend only on the column/row index, so compute them once
        # instead of re-summing for every cell of the grid.
        x_offsets = [sum(get_col_width(c) for c in range(col)) for col in range(columns)]
        y_offsets = [sum(get_row_height(r) for r in range(row)) for row in range(rows)]

        cell_coords = {}

        for row in range(rows):
            for col in range(columns):
                cell_key = f"{row}-{col}"

                # Skip hidden cells and cells covered by merges
                if hidden_cells.get(cell_key) or (row, col) in merged_spanned_cells:
                    continue

                # A merge origin spans several columns/rows; others span one.
                merge_info = merged_cells[cell_key] if cell_key in merged_cells else {}
                colspan = merge_info.get("colspan", 1)
                rowspan = merge_info.get("rowspan", 1)

                width = sum(get_col_width(c) for c in range(col, col + colspan))
                height = sum(get_row_height(r) for r in range(row, row + rowspan))

                # Coordinates are in original-image space (no scaling applied).
                cell_coords[(row, col)] = {
                    "x": (x_offsets[col] + table_x),
                    "y": (y_offsets[row] + table_y),
                    "width": width,
                    "height": height,
                    "colspan": colspan,
                    "rowspan": rowspan
                }

        return cell_coords

    def determine_cell_borders(self, cell_data: Optional[Dict], table_properties: Dict) -> Tuple[int, int, int, int]:
        """
        Determine border visibility for each side of a cell

        Args:
            cell_data: Individual cell data from JSON
            table_properties: Global table properties

        Returns:
            Tuple of (top, bottom, left, right) border flags (0 or 1)
        """
        # Every side starts from the table-wide "all" setting.
        has_global_borders = table_properties.get("cellBorders", {}).get("all", False)
        default_flag = 1 if has_global_borders else 0
        borders = {
            "top": default_flag,
            "bottom": default_flag,
            "left": default_flag,
            "right": default_flag
        }

        if cell_data and "cellStyle" in cell_data:
            cell_style = cell_data["cellStyle"]

            border_mappings = {
                "borderTopWidth": "top",
                "borderBottomWidth": "bottom",
                "borderLeftWidth": "left",
                "borderRightWidth": "right"
            }

            # Only sides with an explicit width are overridden; a style entry
            # without a matching width entry is not consulted.
            for width_key, border_side in border_mappings.items():
                if width_key not in cell_style:
                    continue

                has_border = cell_style[width_key] > 0

                # A "none" style hides the border regardless of its width.
                style_key = width_key.replace("Width", "Style")
                if style_key in cell_style and cell_style[style_key] == "none":
                    has_border = False

                borders[border_side] = 1 if has_border else 0

        return borders["top"], borders["bottom"], borders["left"], borders["right"]

    @staticmethod
    def _rect_points(x1: int, y1: int, x2: int, y2: int) -> str:
        """Format a rectangle as the four-corner "points" string used in the XML."""
        return f"{x1},{y1} {x2},{y1} {x2},{y2} {x1},{y2}"

    def convert_table_to_xml(self, table_data: Dict, output_filename: str) -> Tuple[ET.Element, Dict]:
        """
        Convert a single table to XML format with crop information

        Args:
            table_data: Single table data from JSON
            output_filename: Filename to reference in XML

        Returns:
            Tuple of (XML root element, crop info dictionary). The crop info
            has the keys "crop_x", "crop_y", "crop_width", "crop_height",
            "padding" and "table_id".
        """
        # Extract table properties
        properties = table_data.get("properties", {})
        table_x = table_data.get("x", 0)
        table_y = table_data.get("y", 0)
        table_width = table_data.get("width", properties.get("width", 0))
        table_height = table_data.get("height", properties.get("height", 0))

        # Calculate padding based on table dimensions
        min_dimension = min(table_width, table_height)
        padding = int(min_dimension * self.padding_ratio)

        # The crop window may not start left of / above the image. crop_image
        # clamps there as well, so the origin is clamped here too; otherwise
        # the XML coordinates would be shifted relative to the saved crop.
        left_pad = min(padding, table_x)
        top_pad = min(padding, table_y)
        crop_x = table_x - left_pad
        crop_y = table_y - top_pad
        crop_width = table_width + (2 * padding) - (padding - left_pad)
        crop_height = table_height + (2 * padding) - (padding - top_pad)

        # Integer origin of the crop; matches the left/top used by crop_image.
        origin_x = int(crop_x)
        origin_y = int(crop_y)

        # Create XML structure
        root = ET.Element("document", filename=output_filename)
        table_elem = ET.SubElement(root, "table")

        # Table outline, transformed exactly like the cells below.
        table_coords = self._rect_points(
            int(table_x) - origin_x,
            int(table_y) - origin_y,
            int(table_x + table_width) - origin_x,
            int(table_y + table_height) - origin_y,
        )
        ET.SubElement(table_elem, "Coords", points=table_coords)

        # Get cell coordinates and data
        cell_coords = self.calculate_cell_coordinates(properties, table_x, table_y)
        cell_data = properties.get("cellData", {})

        # Create XML elements for each cell
        for (row, col), coords in cell_coords.items():
            current_cell_data = cell_data.get(f"{row}-{col}", {})

            # Determine cell span (for merged cells)
            end_row = row + coords["rowspan"] - 1
            end_col = col + coords["colspan"] - 1

            cell_elem = ET.SubElement(table_elem, "cell")
            cell_elem.set("start-row", str(row))
            cell_elem.set("end-row", str(end_row))
            cell_elem.set("start-col", str(col))
            cell_elem.set("end-col", str(end_col))

            # Convert original-image coordinates to cropped-image space
            cell_coords_str = self._rect_points(
                int(coords["x"]) - origin_x,
                int(coords["y"]) - origin_y,
                int(coords["x"] + coords["width"]) - origin_x,
                int(coords["y"] + coords["height"]) - origin_y,
            )
            ET.SubElement(cell_elem, "Coords", points=cell_coords_str)

            # Add border information
            top, bottom, left, right = self.determine_cell_borders(current_cell_data, properties)
            ET.SubElement(cell_elem, "Lines",
                          top=str(top),
                          bottom=str(bottom),
                          left=str(left),
                          right=str(right))

        crop_info = {
            "crop_x": crop_x,
            "crop_y": crop_y,
            "crop_width": crop_width,
            "crop_height": crop_height,
            "padding": padding,
            "table_id": table_data.get("id", "unknown")
        }

        return root, crop_info

    def save_xml(self, xml_root: ET.Element, output_path: str) -> bool:
        """
        Save XML to file with pretty formatting

        Args:
            xml_root: XML root element
            output_path: Path to save XML file

        Returns:
            True if successful, False otherwise
        """
        try:
            # Convert to pretty-formatted string
            rough_string = ET.tostring(xml_root, encoding='unicode')
            reparsed = minidom.parseString(rough_string)
            pretty_xml = reparsed.toprettyxml(indent=" ")

            # Drop the whitespace-only lines left by pretty printing
            pretty_xml = '\n'.join(line for line in pretty_xml.split('\n') if line.strip())

            with open(output_path, 'w', encoding='utf-8') as f:
                f.write(pretty_xml)

            return True
        except Exception as e:
            # Best-effort: report and let the caller skip this table.
            print(f"❌ Error saving XML to {output_path}: {e}")
            return False

    def crop_image(self, image_path: str, crop_info: Dict, output_path: str) -> bool:
        """
        Crop image based on crop information

        Args:
            image_path: Path to original image
            crop_info: Crop information dictionary
            output_path: Path to save cropped image

        Returns:
            True if successful, False otherwise
        """
        try:
            # Imported lazily so the rest of the class works without Pillow.
            from PIL import Image

            with Image.open(image_path) as img:
                # Ensure crop coordinates are within image bounds
                left = max(0, int(crop_info['crop_x']))
                top = max(0, int(crop_info['crop_y']))
                right = min(img.width, int(crop_info['crop_x'] + crop_info['crop_width']))
                bottom = min(img.height, int(crop_info['crop_y'] + crop_info['crop_height']))

                # A table lying outside the image leaves nothing to crop.
                if right <= left or bottom <= top:
                    print(f"❌ Error cropping image: empty crop region ({left}, {top}, {right}, {bottom})")
                    return False

                cropped_img = img.crop((left, top, right, bottom))
                cropped_img.save(output_path)

            return True

        except ImportError:
            print("❌ PIL/Pillow not installed. Run: pip install Pillow")
            return False
        except Exception as e:
            print(f"❌ Error cropping image: {e}")
            return False

    def generate_output_filenames(self, base_name: str, table_index: int, table_id: str, total_tables: int, output_dir: str) -> Tuple[str, str, str]:
        """
        Generate appropriate output filenames for XML and image files

        Args:
            base_name: Base filename without extension
            table_index: Index of current table
            table_id: ID of the table from JSON
            total_tables: Total number of tables in the file
            output_dir: Output directory

        Returns:
            Tuple of (xml_path, image_path, image_filename_for_xml)
        """
        if total_tables > 1:
            # Multiple tables: add index and ID to filename. The id comes
            # straight from JSON and may be a number, hence the str().
            clean_table_id = str(table_id).replace('/', '_').replace('\\', '_')
            stem = f"{base_name}_table_{table_index}_{clean_table_id}"
            xml_filename = f"{stem}.xml"
            image_filename = f"{stem}.png"
        else:
            # Single table: use simple filename
            xml_filename = f"{base_name}.xml"
            image_filename = f"{base_name}_cropped.png"

        xml_path = os.path.join(output_dir, xml_filename)
        image_path = os.path.join(output_dir, image_filename)

        return xml_path, image_path, image_filename

    def process_single_file(self, json_path: str, image_path: str, output_dir: str = "output") -> int:
        """
        Process a single JSON+PNG file pair to extract all tables

        Args:
            json_path: Path to JSON file
            image_path: Path to PNG image file
            output_dir: Directory for output files

        Returns:
            Number of tables successfully processed
        """
        try:
            os.makedirs(output_dir, exist_ok=True)

            with open(json_path, 'r', encoding='utf-8') as f:
                json_data = json.load(f)

            # extract_tables_from_json unwraps an {"items": [...]} page and
            # also accepts a bare list or a single table dict.
            tables = self.extract_tables_from_json(json_data)

            if not tables:
                print(f"❌ No tables found in {json_path}")
                return 0

            print(f"📋 Found {len(tables)} table(s) in {json_path}")

            base_name = os.path.splitext(os.path.basename(json_path))[0]
            successful_count = 0

            for table_index, table_data in enumerate(tables):
                try:
                    table_id = table_data.get('id', f'table_{table_index}')
                    print(f" 🔄 Processing table {table_index + 1}/{len(tables)} (id: {table_id})")

                    xml_path, image_output_path, image_filename = self.generate_output_filenames(
                        base_name, table_index, table_id, len(tables), output_dir
                    )

                    xml_root, crop_info = self.convert_table_to_xml(table_data, image_filename)

                    # A table counts as processed only if both outputs exist.
                    if not self.save_xml(xml_root, xml_path):
                        continue

                    if not self.crop_image(image_path, crop_info, image_output_path):
                        continue

                    print(f" ✅ Table {table_index + 1} completed:")
                    print(f" 📄 XML: {xml_path}")
                    print(f" 🖼️ Image: {image_output_path}")
                    print(f" 📏 Padding: {crop_info['padding']}px ({self.padding_ratio:.1%})")

                    successful_count += 1

                except Exception as e:
                    # One malformed table must not abort the remaining ones.
                    print(f" ❌ Error processing table {table_index + 1}: {e}")
                    continue

            print(f"✅ Successfully processed {successful_count}/{len(tables)} tables from {json_path}")
            return successful_count

        except Exception as e:
            print(f"❌ Error processing file {json_path}: {e}")
            return 0

    def process_batch(self, input_dir: str, output_dir: str = "output") -> int:
        """
        Batch process all JSON+PNG pairs in a directory

        Args:
            input_dir: Directory containing JSON and PNG files
            output_dir: Directory for output files

        Returns:
            Total number of tables processed across all files
        """
        try:
            # Sorted so runs are reproducible; os.listdir order is arbitrary.
            json_files = sorted(f for f in os.listdir(input_dir) if f.endswith('.json'))

            if not json_files:
                print(f"❌ No JSON files found in {input_dir}")
                return 0

            print(f"🗂️ Found {len(json_files)} JSON files to process")

            total_tables = 0
            files_processed = 0

            for json_file in json_files:
                # The page image is expected next to the JSON, same stem.
                base_name = os.path.splitext(json_file)[0]
                png_file = f"{base_name}.png"

                json_path = os.path.join(input_dir, json_file)
                png_path = os.path.join(input_dir, png_file)

                if os.path.exists(png_path):
                    print(f"\n📋 Processing file pair: {base_name}")
                    tables_count = self.process_single_file(json_path, png_path, output_dir)
                    if tables_count > 0:
                        total_tables += tables_count
                        files_processed += 1
                else:
                    print(f"⚠️ Warning: No corresponding PNG file found for {json_file}")

            print(f"\n🎉 Batch processing completed!")
            print(f" 📁 Files processed: {files_processed}/{len(json_files)}")
            print(f" 📊 Total tables processed: {total_tables}")

            return total_tables

        except Exception as e:
            print(f"❌ Error in batch processing: {e}")
            return 0
|
| 493 |
+
|
| 494 |
+
|
| 495 |
+
def main():
    """Print a short usage banner, then batch-process one hard-coded folder."""
    # Input folder and output folder for the batch run below.
    # To convert a single JSON/PNG pair instead, call
    # processor.process_single_file(json_path, png_path, 'output').
    batch_input_dir = '/Users/tuvn18/Desktop/tuvn18/dev/KIAI/dev/trace/40_page_70_110925'
    batch_output_dir = 'output_folder'

    banner_lines = (
        "🔧 JSON Table to XML Converter",
        "=" * 50,
        "\n📖 Usage Examples:",
        "1. Single file (all tables):",
        " processor.process_single_file('page1.json', 'page1.png', 'output')",
        "\n2. Batch processing (all files, all tables):",
        " processor.process_batch('input_folder', 'output_folder')",
        "\n3. Custom padding:",
        " processor = TableProcessor(padding_ratio=0.08) # 8% padding",
    )

    # padding_ratio=0.02 means 2% of the table's smaller side.
    processor = TableProcessor(padding_ratio=0.02)

    for banner_line in banner_lines:
        print(banner_line)

    processor.process_batch(batch_input_dir, batch_output_dir)


if __name__ == "__main__":
    main()
|
delete_name.py
ADDED
|
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
|
| 3 |
+
# Folder whose file names should lose the "_cropped" marker;
# replace with the path to your folder.
folder = "output_folder"

# Pair every affected name with its cleaned-up replacement, then rename.
pending_renames = [
    (entry, entry.replace("_cropped", ""))
    for entry in os.listdir(folder)
    if "_cropped" in entry
]
for current_name, cleaned_name in pending_renames:
    os.rename(os.path.join(folder, current_name), os.path.join(folder, cleaned_name))

print("✅ Đã xoá '_cropped' khỏi tên file.")
|
json2txt.py
ADDED
|
@@ -0,0 +1,31 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import json
|
| 2 |
+
import cv2
|
| 3 |
+
import os
|
| 4 |
+
|
| 5 |
+
# Folder holding the <name>.json / <name>.png pairs, and the folder that
# receives the visualisations.
page70_folder = "/home/tuvu/Downloads/9_11_2025"
save_folder = "vis_img"
os.makedirs(save_folder, exist_ok=True)

for name in os.listdir(page70_folder):
    if not name.endswith(".json"):
        continue

    json_name = os.path.join(page70_folder, name)
    with open(json_name, "r", encoding="utf-8") as f:
        data = json.load(f)

    items = data.get("items", [])
    # splitext keeps dots inside the stem ("a.b.json" -> "a.b.png").
    img_name = os.path.splitext(name)[0] + ".png"
    print(img_name)

    image = cv2.imread(os.path.join(page70_folder, img_name))
    if image is None:
        # imread signals a missing/unreadable file by returning None.
        print(f"Skipping {name}: could not read image {img_name}")
        continue
    print(image.shape)

    # Filter items where type == "table"
    tables = [item for item in items if item.get("type") == "table"]
    for table in tables:
        # JSON coordinates are drawn at 2x on the image. cv2.rectangle needs
        # integer points, so width/height are cast like x/y.
        x, y = 2 * int(table["x"]), 2 * int(table["y"])
        w, h = int(2 * table["width"]), int(2 * table["height"])

        top_left = (x, y)
        bottom_right = (x + w, y + h)
        print(top_left, bottom_right)
        cv2.rectangle(image, top_left, bottom_right, (0, 255, 0), 2)  # green box

    # Save the annotated page
    cv2.imwrite(os.path.join(save_folder, img_name), image)
|
json2xml.py
ADDED
|
@@ -0,0 +1,243 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import json
|
| 2 |
+
import xml.etree.ElementTree as ET
|
| 3 |
+
from xml.dom import minidom
|
| 4 |
+
|
| 5 |
+
def get_visible_cell_coords(table_properties, table_x, table_y):
    """Return the rectangle of every visible cell, keyed by (row, col).

    Hidden cells and cells covered by a merge (other than the merge origin)
    are left out. Each value is a dict with "x", "y", "width" and "height";
    all four are multiplied by 2, and x/y include the table offset.
    """
    DEFAULT_WIDTH = 100
    DEFAULT_HEIGHT = 30

    n_rows = table_properties.get("rows", 0)
    n_cols = table_properties.get("columns", 0)
    widths_by_col = table_properties.get("columnWidths", {})
    heights_by_row = table_properties.get("rowHeights", {})
    merges = table_properties.get("mergedCells", {})
    hidden = table_properties.get("hiddenCells", {})

    def span_width(first_col, count):
        # Total width of `count` columns starting at `first_col`.
        return sum(
            widths_by_col.get(str(c), DEFAULT_WIDTH)
            for c in range(first_col, first_col + count)
        )

    def span_height(first_row, count):
        # Total height of `count` rows starting at `first_row`.
        return sum(
            heights_by_row.get(str(r), DEFAULT_HEIGHT)
            for r in range(first_row, first_row + count)
        )

    # Every position swallowed by a merge, except the merge's top-left cell.
    covered = set()
    for key, info in merges.items():
        anchor_row, anchor_col = map(int, key.split('-'))
        row_count = info.get('rowspan', 1)
        col_count = info.get('colspan', 1)
        covered.update(
            (r, c)
            for r in range(anchor_row, anchor_row + row_count)
            for c in range(anchor_col, anchor_col + col_count)
            if (r, c) != (anchor_row, anchor_col)
        )

    visible = {}
    for row in range(n_rows):
        for col in range(n_cols):
            key = f"{row}-{col}"
            if hidden.get(key) or (row, col) in covered:
                continue

            # Position = everything to the left of / above this cell.
            left = span_width(0, col)
            top = span_height(0, row)

            # Merge origins stretch over their span; plain cells cover 1x1.
            span = merges[key] if key in merges else {}
            col_count = span.get("colspan", 1)
            row_count = span.get("rowspan", 1)

            visible[(row, col)] = {
                "x": 2 * (left + table_x),
                "y": 2 * (top + table_y),
                "width": 2 * span_width(col, col_count),
                "height": 2 * span_height(row, row_count)
            }

    return visible
|
| 67 |
+
|
| 68 |
+
def get_cell_borders(cell_data, table_properties):
    """Return (top, bottom, left, right) border flags (0 or 1) for a cell.

    Every side starts from the table-wide ``cellBorders["all"]`` setting.
    A side is overridden only when the cell's ``cellStyle`` has the matching
    ``border<Side>Width`` entry: the border is on when that width is > 0,
    unless the matching ``border<Side>Style`` is "none". A style entry
    without a width entry is not consulted.
    """
    # Get global table border settings
    has_global_borders = table_properties.get("cellBorders", {}).get("all", False)
    default_flag = 1 if has_global_borders else 0

    borders = {
        "top": default_flag,
        "bottom": default_flag,
        "left": default_flag,
        "right": default_flag
    }

    # Check if cell has custom border styling
    if cell_data and "cellStyle" in cell_data:
        cell_style = cell_data["cellStyle"]

        border_mappings = {
            "borderTopWidth": "top",
            "borderBottomWidth": "bottom",
            "borderLeftWidth": "left",
            "borderRightWidth": "right"
        }

        for width_key, border_side in border_mappings.items():
            if width_key not in cell_style:
                continue

            has_border = cell_style[width_key] > 0

            # The style key is derived from the width key
            # ("borderTopWidth" -> "borderTopStyle").
            style_key = width_key.replace("Width", "Style")
            if style_key in cell_style and cell_style[style_key] == "none":
                has_border = False

            borders[border_side] = 1 if has_border else 0

    return borders["top"], borders["bottom"], borders["left"], borders["right"]
|
| 122 |
+
|
| 123 |
+
def convert_json_to_xml(json_data, filename="table.jpg"):
    """Convert JSON table data to an XML element tree.

    Args:
        json_data: A JSON string, a single table dict, or a list of item
            dicts. For a list, the first item whose ``"type"`` is
            ``"table"`` is converted; when no item is tagged that way the
            first item of the list is used.
        filename: Value stored in the ``filename`` attribute of the root
            ``<document>`` element.

    Returns:
        The root ``<document>`` element. It contains one ``<table>`` with a
        ``<Coords>`` child for the table rectangle and one ``<cell>`` per
        visible cell, each carrying ``<Coords>`` and ``<Lines>`` children.

    Raises:
        IndexError: If ``json_data`` is an empty list.
        ValueError: If ``json_data`` is a string that is not valid JSON
            (raised by ``json.loads``).
    """
    # Parse JSON if it's a string
    data = json.loads(json_data) if isinstance(json_data, str) else json_data

    if isinstance(data, list):
        # An item list may mix tables with other item types, so prefer the
        # first item explicitly tagged as a table. Falling back to data[0]
        # keeps the previous behaviour for untagged lists (and still raises
        # IndexError on an empty list).
        table_data = next(
            (
                item
                for item in data
                if isinstance(item, dict) and item.get("type") == "table"
            ),
            None,
        )
        if table_data is None:
            table_data = data[0]
    else:
        table_data = data

    # Extract table information
    properties = table_data.get("properties", {})
    table_x = table_data.get("x", 0)
    table_y = table_data.get("y", 0)
    # Width/height may live on the item itself or inside its properties.
    table_width = table_data.get("width", properties.get("width", 0))
    table_height = table_data.get("height", properties.get("height", 0))

    # Create XML root structure
    root = ET.Element("document", filename=filename)
    table_elem = ET.SubElement(root, "table")

    # Table rectangle as four corner points, clockwise from top-left
    x1, y1 = int(table_x), int(table_y)
    x2, y2 = int(table_x + table_width), int(table_y + table_height)
    table_coords = f"{x1},{y1} {x2},{y1} {x2},{y2} {x1},{y2}"
    ET.SubElement(table_elem, "Coords", points=table_coords)

    # Get cell coordinates and data
    cell_coords = get_visible_cell_coords(properties, table_x, table_y)
    cell_data = properties.get("cellData", {})
    merged_cells = properties.get("mergedCells", {})

    # Create XML elements for each visible cell
    for (row, col), coords in cell_coords.items():
        cell_key = f"{row}-{col}"
        current_cell_data = cell_data.get(cell_key, {})

        # A merged cell ends at its last spanned row/column; a plain cell
        # starts and ends on the same row/column.
        if cell_key in merged_cells:
            merge_info = merged_cells[cell_key]
            end_row = row + merge_info.get("rowspan", 1) - 1
            end_col = col + merge_info.get("colspan", 1) - 1
        else:
            end_row = row
            end_col = col

        # Create cell XML element
        cell_elem = ET.SubElement(table_elem, "cell")
        cell_elem.set("start-row", str(row))
        cell_elem.set("end-row", str(end_row))
        cell_elem.set("start-col", str(col))
        cell_elem.set("end-col", str(end_col))

        # Cell rectangle, same corner order as the table rectangle
        x1 = int(coords["x"])
        y1 = int(coords["y"])
        x2 = int(coords["x"] + coords["width"])
        y2 = int(coords["y"] + coords["height"])
        cell_coord_str = f"{x1},{y1} {x2},{y1} {x2},{y2} {x1},{y2}"
        ET.SubElement(cell_elem, "Coords", points=cell_coord_str)

        # Add border information
        top, bottom, left, right = get_cell_borders(current_cell_data, properties)
        ET.SubElement(
            cell_elem,
            "Lines",
            top=str(top),
            bottom=str(bottom),
            left=str(left),
            right=str(right),
        )

    return root
|
| 198 |
+
|
| 199 |
+
def save_xml_to_file(xml_root, output_path):
    """Write an element tree to ``output_path`` as indented UTF-8 XML.

    Args:
        xml_root: Root ``xml.etree.ElementTree`` element to serialise.
        output_path: Destination file path; an existing file is overwritten.
    """
    # Round-trip through minidom, which provides the pretty printer.
    serialized = ET.tostring(xml_root, encoding='unicode')
    document = minidom.parseString(serialized)

    # Keep only lines with content so the output has no blank lines.
    kept_lines = []
    for line in document.toprettyxml(indent=" ").split('\n'):
        if line.strip():
            kept_lines.append(line)

    with open(output_path, 'w', encoding='utf-8') as handle:
        handle.write('\n'.join(kept_lines))
|
| 213 |
+
|
| 214 |
+
def convert_json_file_to_xml(json_file_path, xml_file_path, filename="table.jpg"):
    """Convert a JSON annotation file to an XML file.

    Args:
        json_file_path: Path of the JSON file to read (UTF-8).
        xml_file_path: Path of the XML file to write.
        filename: Value for the ``filename`` attribute of the XML root.

    Returns:
        True when the XML file was written, False when any step failed. The
        failure reason is printed rather than raised.
    """
    try:
        # Read JSON file
        with open(json_file_path, 'r', encoding='utf-8') as f:
            json_data = json.load(f)

        # Unwrap the {"items": [...]} envelope when it is present. A
        # top-level list or a bare table dict is passed through unchanged,
        # since convert_json_to_xml accepts both.
        if isinstance(json_data, dict) and 'items' in json_data:
            json_data = json_data['items']

        # Convert to XML
        xml_root = convert_json_to_xml(json_data, filename)

        # Save XML file
        save_xml_to_file(xml_root, xml_file_path)

        print(f"✅ Successfully converted {json_file_path} to {xml_file_path}")
        return True

    except Exception as e:
        # Best-effort boundary for batch runs: report the failure and let
        # the caller continue with the next file.
        print(f"❌ Error converting file: {e}")
        return False
|
| 233 |
+
|
| 234 |
+
# Example usage and testing
|
| 235 |
+
if __name__ == "__main__":

    import os

    folder = "/Users/tuvn18/Desktop/tuvn18/dev/KIAI/dev/trace/test_json"
    for name in os.listdir(folder):
        # Split off the real extension: a plain endswith('json') would also
        # match names without a ".json" suffix, and str.replace could rewrite
        # ".json" in the middle of a name.
        stem, ext = os.path.splitext(name)
        if ext != '.json':
            continue

        json_name = os.path.join(folder, name)
        # The XML is written relative to the current working directory, and
        # the matching PNG name is recorded as the document filename.
        xml_name = stem + '.xml'
        convert_json_file_to_xml(json_name, xml_name, stem + '.png')
|
rename.py
ADDED
|
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
# Folder whose PNG/JSON files are renamed in place.
folder_json = "/Users/tuvn18/Desktop/tuvn18/dev/KIAI/dev/trace/40_page_70_110925"

for name in os.listdir(folder_json):
    # Split the dash-separated fields from the stem, not the full name, so
    # the extension never ends up inside a field (which used to produce
    # names such as "a.png.png" or "id.json.json").
    stem, ext = os.path.splitext(name)
    fields = stem.split('-')

    if ext == '.png':
        # PNG files keep the first dash-separated field.
        new_name = fields[0] + '.png'
    elif ext == '.json':
        # JSON files keep the second dash-separated field.
        if len(fields) < 2:
            # No second field: skip instead of aborting the run halfway
            # through the folder with an IndexError.
            print(f"Skipping {name}: expected a '-' separated name")
            continue
        new_name = fields[1] + '.json'
    else:
        continue

    if new_name != name:
        os.rename(os.path.join(folder_json, name), os.path.join(folder_json, new_name))
|
resize.py
ADDED
|
@@ -0,0 +1,26 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from PIL import Image
|
| 2 |
+
import os
|
| 3 |
+
|
| 4 |
+
# Input and output folders
input_folder = "/Users/tuvn18/Desktop/tuvn18/dev/KIAI/dev/trace/page_39"
output_folder = "/Users/tuvn18/Desktop/tuvn18/dev/KIAI/dev/trace/train_table_1209"

os.makedirs(output_folder, exist_ok=True)

# Loop through all files in the folder
for filename in os.listdir(input_folder):
    # Skip non-image files
    if not filename.lower().endswith((".png", ".jpg", ".jpeg", ".bmp", ".gif")):
        continue

    file_path = os.path.join(input_folder, filename)

    # Open and resize to half size
    with Image.open(file_path) as img:
        w, h = img.size
        # Clamp to 1 px: halving a 1-pixel dimension gives 0, which
        # Image.resize rejects.
        half_size = (max(1, w // 2), max(1, h // 2))
        resized = img.resize(half_size, Image.LANCZOS)

        # Save to output folder under the same file name
        resized.save(os.path.join(output_folder, filename))

print("✅ Done! All images resized to half.")
|
vis_json_cell.py
ADDED
|
@@ -0,0 +1,103 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import json
|
| 2 |
+
import cv2
|
| 3 |
+
import os
|
| 4 |
+
|
| 5 |
+
def get_visible_cell_coords(table_properties, table_x, table_y):
    """Compute the pixel rectangle of every visible cell of a table.

    Args:
        table_properties: Table description dict. Keys read here: ``rows``,
            ``columns``, ``columnWidths`` and ``rowHeights`` (keyed by the
            index as a string), ``mergedCells`` (keyed ``"row-col"`` with
            ``rowspan``/``colspan``) and ``hiddenCells`` (keyed
            ``"row-col"``). Missing or null maps are treated as empty.
        table_x: X offset added to every cell.
        table_y: Y offset added to every cell.

    Returns:
        Dict mapping ``(row, col)`` to ``{"x", "y", "width", "height"}``
        (all ints). Hidden cells and cells covered by a merge are omitted;
        the origin cell of a merge spans the whole merged area.
    """
    rows = table_properties.get("rows", 0)
    columns = table_properties.get("columns", 0)
    # `or {}` also covers keys that are present but null in the JSON.
    column_widths = table_properties.get("columnWidths") or {}
    row_heights = table_properties.get("rowHeights") or {}
    merged_cells = table_properties.get("mergedCells") or {}
    hidden_cells = table_properties.get("hiddenCells") or {}

    # Sizes used for columns/rows without an explicit entry.
    DEFAULT_WIDTH = 100
    DEFAULT_HEIGHT = 30

    def get_col_width(col):
        return column_widths.get(str(col), DEFAULT_WIDTH)

    def get_row_height(row):
        return row_heights.get(str(row), DEFAULT_HEIGHT)

    # Collect every cell covered by a merge, excluding the merge origin
    # (top-left), which stays visible and represents the merged area.
    merged_spanned_cells = set()
    for key, merge_info in merged_cells.items():
        base_row, base_col = map(int, key.split('-'))
        rowspan = merge_info.get('rowspan', 1)
        colspan = merge_info.get('colspan', 1)
        for r in range(base_row, base_row + rowspan):
            for c in range(base_col, base_col + colspan):
                if (r, c) != (base_row, base_col):
                    merged_spanned_cells.add((r, c))

    # Left edge of each column and top edge of each row, computed once
    # instead of re-summing all preceding sizes for every cell.
    col_offsets = []
    running = 0
    for c in range(columns):
        col_offsets.append(running)
        running += get_col_width(c)

    row_offsets = []
    running = 0
    for r in range(rows):
        row_offsets.append(running)
        running += get_row_height(r)

    result = {}

    for row in range(rows):
        for col in range(columns):
            coord_key = f"{row}-{col}"
            if hidden_cells.get(coord_key):
                continue  # Skip hidden cells
            if (row, col) in merged_spanned_cells:
                continue  # Skip cells covered by merged cells

            # A merge origin spans several columns/rows; others span one.
            if coord_key in merged_cells:
                colspan = merged_cells[coord_key].get("colspan", 1)
                rowspan = merged_cells[coord_key].get("rowspan", 1)
            else:
                colspan = 1
                rowspan = 1

            # Summed directly because a span may run past the declared
            # table size, where the default sizes apply.
            width = sum(get_col_width(c) for c in range(col, col + colspan))
            height = sum(get_row_height(r) for r in range(row, row + rowspan))

            result[(row, col)] = {
                "x": int(col_offsets[col] + table_x),
                "y": int(row_offsets[row] + table_y),
                "width": int(width),
                "height": int(height),
            }

    return result
|
| 66 |
+
|
| 67 |
+
# Draw the cell rectangles of every table annotation onto its page image.
folder_path = "/Users/tuvn18/Desktop/tuvn18/dev/KIAI/dev/trace/train_table_1209"
save_folder = "cell_vis"
os.makedirs(save_folder, exist_ok=True)
for name in os.listdir(folder_path):
    # Split off the real extension so a dotted stem keeps all its parts
    # when the image name is derived from it.
    stem, ext = os.path.splitext(name)
    if ext != ".json":
        continue

    json_file = os.path.join(folder_path, name)
    with open(json_file, "r", encoding="utf-8") as f:
        data = json.load(f)

    # The image is expected next to the JSON file, with the same stem.
    img_name = stem + ".png"
    print(img_name)
    image = cv2.imread(os.path.join(folder_path, img_name))
    if image is None:
        # cv2.imread returns None instead of raising when the file is
        # missing or unreadable; drawing on None would crash.
        print(f"Skipping {img_name}: image not found or unreadable")
        continue

    for table in data.get('items') or []:
        # Only table items carry cell geometry.
        if table.get('type') != 'table':
            continue
        table_x = table.get('x') or 0
        table_y = table.get('y') or 0

        table_prob = table.get('properties') or {}
        boxes = get_visible_cell_coords(table_prob, table_x, table_y)

        for box in boxes.values():
            x, y, w, h = int(box["x"]), int(box["y"]), int(box["width"]), int(box["height"])

            top_left = (x, y)
            bottom_right = (x + w, y + h)

            # Green outline, 2 px thick
            cv2.rectangle(image, top_left, bottom_right, (0, 255, 0), 2)

    # Save the annotated image under the same name in the output folder
    cv2.imwrite(os.path.join(save_folder, img_name), image)
|