kiaisoft committed on
Commit
66180d7
·
verified ·
1 Parent(s): 4812f30

Upload 7 files

Browse files
Files changed (7) hide show
  1. crop_tables.py +517 -0
  2. delete_name.py +12 -0
  3. json2txt.py +31 -0
  4. json2xml.py +243 -0
  5. rename.py +10 -0
  6. resize.py +26 -0
  7. vis_json_cell.py +103 -0
crop_tables.py ADDED
@@ -0,0 +1,517 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ JSON Table to XML Converter
4
+ Processes JSON files containing table data and corresponding PNG images
5
+ to create cropped sub-table images and XML coordinate files for ALL tables found.
6
+ """
7
+
8
+ import json
9
+ import xml.etree.ElementTree as ET
10
+ from xml.dom import minidom
11
+ import os
12
+ from typing import Dict, List, Tuple, Any, Optional
13
+
14
class TableProcessor:
    """Convert table items of a page JSON export into XML plus cropped images.

    For every item with ``type == "table"`` the processor writes an XML file
    describing the table and its visible cells, and crops the matching region
    (table plus padding) out of the page image. All coordinates written to
    the XML are relative to the cropped image.
    """

    def __init__(self, padding_ratio: float = 0.05):
        """
        Initialize the table processor

        Args:
            padding_ratio: Padding around table as ratio of min(width, height)
        """
        self.padding_ratio = padding_ratio
        # Fallback sizes for columns/rows missing from columnWidths/rowHeights.
        self.DEFAULT_WIDTH = 100
        self.DEFAULT_HEIGHT = 30

    def extract_tables_from_json(self, json_data: Any) -> List[Dict]:
        """
        Extract all table items from JSON data

        Args:
            json_data: Parsed JSON data (dict or list)

        Returns:
            List of table dictionaries (empty when nothing matches)
        """
        if isinstance(json_data, list):
            # Non-dict entries (null, strings, ...) are ignored instead of
            # crashing on ``.get``.
            return [
                item for item in json_data
                if isinstance(item, dict) and item.get("type") == "table"
            ]
        if isinstance(json_data, dict) and json_data.get("type") == "table":
            # Single table item
            return [json_data]
        return []

    def calculate_cell_coordinates(self, table_properties: Dict, table_x: float, table_y: float) -> Dict[Tuple[int, int], Dict]:
        """
        Calculate coordinates for all visible cells in the table

        Args:
            table_properties: Table properties from JSON
            table_x: Table X position in original image
            table_y: Table Y position in original image

        Returns:
            Dictionary mapping (row, col) to a dict with keys
            x, y, width, height, colspan, rowspan (page coordinates, unscaled)
        """
        # ``or`` also covers an explicit JSON null for these entries.
        rows = table_properties.get("rows") or 0
        columns = table_properties.get("columns") or 0
        column_widths = table_properties.get("columnWidths") or {}
        row_heights = table_properties.get("rowHeights") or {}
        merged_cells = table_properties.get("mergedCells") or {}
        hidden_cells = table_properties.get("hiddenCells") or {}

        def get_col_width(col: int) -> float:
            return column_widths.get(str(col), self.DEFAULT_WIDTH)

        def get_row_height(row: int) -> float:
            return row_heights.get(str(row), self.DEFAULT_HEIGHT)

        # Cells swallowed by a merge (every spanned cell except the origin).
        merged_spanned_cells = set()
        for cell_key, merge_info in merged_cells.items():
            base_row, base_col = map(int, cell_key.split('-'))
            rowspan = merge_info.get('rowspan', 1)
            colspan = merge_info.get('colspan', 1)
            for r in range(base_row, base_row + rowspan):
                for c in range(base_col, base_col + colspan):
                    if (r, c) != (base_row, base_col):
                        merged_spanned_cells.add((r, c))

        cell_coords = {}

        for row in range(rows):
            for col in range(columns):
                cell_key = f"{row}-{col}"

                # Skip hidden cells and cells covered by merges
                if hidden_cells.get(cell_key) or (row, col) in merged_spanned_cells:
                    continue

                # Offset inside the table = sum of preceding column widths / row heights
                x = sum(get_col_width(c) for c in range(col))
                y = sum(get_row_height(r) for r in range(row))

                # A merge origin spans several columns/rows; any other cell spans one.
                merge_info = merged_cells.get(cell_key) or {}
                colspan = merge_info.get("colspan", 1)
                rowspan = merge_info.get("rowspan", 1)

                width = sum(get_col_width(c) for c in range(col, col + colspan))
                height = sum(get_row_height(r) for r in range(row, row + rowspan))

                # Stored in page coordinates; no scaling is applied here.
                cell_coords[(row, col)] = {
                    "x": (x + table_x),
                    "y": (y + table_y),
                    "width": width,
                    "height": height,
                    "colspan": colspan,
                    "rowspan": rowspan
                }

        return cell_coords

    def determine_cell_borders(self, cell_data: Optional[Dict], table_properties: Dict) -> Tuple[int, int, int, int]:
        """
        Determine border visibility for each side of a cell

        Args:
            cell_data: Individual cell data from JSON
            table_properties: Global table properties

        Returns:
            Tuple of (top, bottom, left, right) border flags (0 or 1)
        """
        # Every side starts from the table-wide "all" setting.
        cell_borders = table_properties.get("cellBorders") or {}
        default_flag = 1 if cell_borders.get("all", False) else 0
        borders = {side: default_flag for side in ("top", "bottom", "left", "right")}

        cell_style = (cell_data or {}).get("cellStyle") or {}

        # A side is overridden only when its border*Width key is present.
        border_mappings = {
            "borderTopWidth": "top",
            "borderBottomWidth": "bottom",
            "borderLeftWidth": "left",
            "borderRightWidth": "right"
        }
        for width_key, border_side in border_mappings.items():
            if width_key not in cell_style:
                continue
            has_border = cell_style[width_key] > 0
            # A matching border*Style of "none" hides the border regardless of width.
            if cell_style.get(width_key.replace("Width", "Style")) == "none":
                has_border = False
            borders[border_side] = 1 if has_border else 0

        return borders["top"], borders["bottom"], borders["left"], borders["right"]

    def convert_table_to_xml(self, table_data: Dict, output_filename: str) -> Tuple[ET.Element, Dict]:
        """
        Convert a single table to XML format with crop information

        The crop origin is clamped to the image origin (0, 0), which is what
        crop_image() does as well, so the XML coordinates stay aligned with
        the cropped image even for tables closer to the top/left edge than
        the padding.

        Args:
            table_data: Single table data from JSON
            output_filename: Filename to reference in XML

        Returns:
            Tuple of (XML root element, crop info dictionary)
        """
        properties = table_data.get("properties") or {}
        table_x = table_data.get("x", 0)
        table_y = table_data.get("y", 0)
        table_width = table_data.get("width", properties.get("width", 0))
        table_height = table_data.get("height", properties.get("height", 0))

        # Padding is a fraction of the smaller table dimension.
        padding = int(min(table_width, table_height) * self.padding_ratio)

        # Part of the padding that would fall left of / above the image.
        raw_crop_x = table_x - padding
        raw_crop_y = table_y - padding
        shift_x = -raw_crop_x if raw_crop_x < 0 else 0
        shift_y = -raw_crop_y if raw_crop_y < 0 else 0

        crop_x = raw_crop_x + shift_x
        crop_y = raw_crop_y + shift_y
        # Right/bottom edges stay where they were; only the origin moves.
        crop_width = table_width + (2 * padding) - shift_x
        crop_height = table_height + (2 * padding) - shift_y

        root = ET.Element("document", filename=output_filename)
        table_elem = ET.SubElement(root, "table")

        # Table rectangle relative to the cropped image, as integers like the cells.
        table_x_in_crop = padding - shift_x
        table_y_in_crop = padding - shift_y
        tx1, ty1 = int(table_x_in_crop), int(table_y_in_crop)
        tx2 = int(table_x_in_crop + table_width)
        ty2 = int(table_y_in_crop + table_height)
        table_coords = f"{tx1},{ty1} {tx2},{ty1} {tx2},{ty2} {tx1},{ty2}"
        ET.SubElement(table_elem, "Coords", points=table_coords)

        cell_coords = self.calculate_cell_coordinates(properties, table_x, table_y)
        cell_data = properties.get("cellData") or {}

        for (row, col), coords in cell_coords.items():
            current_cell_data = cell_data.get(f"{row}-{col}", {})

            # Inclusive end indices of the (possibly merged) cell
            end_row = row + coords["rowspan"] - 1
            end_col = col + coords["colspan"] - 1

            cell_elem = ET.SubElement(table_elem, "cell")
            cell_elem.set("start-row", str(row))
            cell_elem.set("end-row", str(end_row))
            cell_elem.set("start-col", str(col))
            cell_elem.set("end-col", str(end_col))

            # Page coordinates of the cell rectangle
            original_x1 = int(coords["x"])
            original_y1 = int(coords["y"])
            original_x2 = int(coords["x"] + coords["width"])
            original_y2 = int(coords["y"] + coords["height"])

            # Shift into cropped-image space
            crop_x1 = original_x1 - int(crop_x)
            crop_y1 = original_y1 - int(crop_y)
            crop_x2 = original_x2 - int(crop_x)
            crop_y2 = original_y2 - int(crop_y)

            cell_coords_str = f"{crop_x1},{crop_y1} {crop_x2},{crop_y1} {crop_x2},{crop_y2} {crop_x1},{crop_y2}"
            ET.SubElement(cell_elem, "Coords", points=cell_coords_str)

            top, bottom, left, right = self.determine_cell_borders(current_cell_data, properties)
            ET.SubElement(cell_elem, "Lines",
                          top=str(top),
                          bottom=str(bottom),
                          left=str(left),
                          right=str(right))

        crop_info = {
            "crop_x": crop_x,
            "crop_y": crop_y,
            "crop_width": crop_width,
            "crop_height": crop_height,
            "padding": padding,
            "table_id": table_data.get("id", "unknown")
        }

        return root, crop_info

    def save_xml(self, xml_root: ET.Element, output_path: str) -> bool:
        """
        Save XML to file with pretty formatting

        Args:
            xml_root: XML root element
            output_path: Path to save XML file

        Returns:
            True if successful, False otherwise (the error is printed)
        """
        try:
            rough_string = ET.tostring(xml_root, encoding='unicode')
            pretty_xml = minidom.parseString(rough_string).toprettyxml(indent=" ")

            # Drop whitespace-only lines from the pretty-printed output
            pretty_xml = '\n'.join(line for line in pretty_xml.split('\n') if line.strip())

            with open(output_path, 'w', encoding='utf-8') as f:
                f.write(pretty_xml)

            return True
        except Exception as e:
            print(f"❌ Error saving XML to {output_path}: {e}")
            return False

    def crop_image(self, image_path: str, crop_info: Dict, output_path: str) -> bool:
        """
        Crop image based on crop information

        Args:
            image_path: Path to original image
            crop_info: Crop information dictionary (crop_x, crop_y, crop_width, crop_height)
            output_path: Path to save cropped image

        Returns:
            True if successful, False otherwise (the error is printed)
        """
        try:
            # Imported lazily so the XML-only code paths work without Pillow.
            from PIL import Image

            with Image.open(image_path) as img:
                # Clamp the crop box to the image bounds
                left = max(0, int(crop_info['crop_x']))
                top = max(0, int(crop_info['crop_y']))
                right = min(img.width, int(crop_info['crop_x'] + crop_info['crop_width']))
                bottom = min(img.height, int(crop_info['crop_y'] + crop_info['crop_height']))

                # A table lying outside the image leaves nothing to crop.
                if right <= left or bottom <= top:
                    print(f"❌ Error cropping image: crop box {(left, top, right, bottom)} is empty")
                    return False

                img.crop((left, top, right, bottom)).save(output_path)

            return True

        except ImportError:
            print("❌ PIL/Pillow not installed. Run: pip install Pillow")
            return False
        except Exception as e:
            print(f"❌ Error cropping image: {e}")
            return False

    def generate_output_filenames(self, base_name: str, table_index: int, table_id: str, total_tables: int, output_dir: str) -> Tuple[str, str, str]:
        """
        Generate appropriate output filenames for XML and image files

        Args:
            base_name: Base filename without extension
            table_index: Index of current table
            table_id: ID of the table from JSON (non-string ids are stringified)
            total_tables: Total number of tables in the file
            output_dir: Output directory

        Returns:
            Tuple of (xml_path, image_path, image_filename_for_xml)
        """
        if total_tables > 1:
            # Multiple tables: add index and ID to filename. JSON ids may be
            # numbers, so convert before stripping path separators.
            clean_table_id = str(table_id).replace('/', '_').replace('\\', '_')
            stem = f"{base_name}_table_{table_index}_{clean_table_id}"
            xml_filename = f"{stem}.xml"
            image_filename = f"{stem}.png"
        else:
            # Single table: use simple filename
            xml_filename = f"{base_name}.xml"
            image_filename = f"{base_name}_cropped.png"

        xml_path = os.path.join(output_dir, xml_filename)
        image_path = os.path.join(output_dir, image_filename)

        return xml_path, image_path, image_filename

    def process_single_file(self, json_path: str, image_path: str, output_dir: str = "output") -> int:
        """
        Process a single JSON+PNG file pair to extract all tables

        The JSON may be an object with an "items" list, a bare list of items,
        or a single table object.

        Args:
            json_path: Path to JSON file
            image_path: Path to PNG image file
            output_dir: Directory for output files

        Returns:
            Number of tables successfully processed
        """
        try:
            os.makedirs(output_dir, exist_ok=True)

            with open(json_path, 'r', encoding='utf-8') as f:
                json_data = json.load(f)

            # Unwrap the {"items": [...]} envelope only when it is present, so
            # top-level lists and bare table objects are handled as well.
            if isinstance(json_data, dict) and "items" in json_data:
                json_data = json_data["items"]

            tables = self.extract_tables_from_json(json_data)

            if not tables:
                print(f"❌ No tables found in {json_path}")
                return 0

            print(f"📋 Found {len(tables)} table(s) in {json_path}")

            base_name = os.path.splitext(os.path.basename(json_path))[0]
            successful_count = 0

            for table_index, table_data in enumerate(tables):
                # One broken table must not abort the remaining ones.
                try:
                    table_id = table_data.get('id', f'table_{table_index}')
                    print(f" 🔄 Processing table {table_index + 1}/{len(tables)} (id: {table_id})")

                    xml_path, image_output_path, image_filename = self.generate_output_filenames(
                        base_name, table_index, table_id, len(tables), output_dir
                    )

                    xml_root, crop_info = self.convert_table_to_xml(table_data, image_filename)

                    if not self.save_xml(xml_root, xml_path):
                        continue

                    if not self.crop_image(image_path, crop_info, image_output_path):
                        continue

                    print(f" ✅ Table {table_index + 1} completed:")
                    print(f" 📄 XML: {xml_path}")
                    print(f" 🖼️ Image: {image_output_path}")
                    print(f" 📏 Padding: {crop_info['padding']}px ({self.padding_ratio:.1%})")

                    successful_count += 1

                except Exception as e:
                    print(f" ❌ Error processing table {table_index + 1}: {e}")
                    continue

            print(f"✅ Successfully processed {successful_count}/{len(tables)} tables from {json_path}")
            return successful_count

        except Exception as e:
            print(f"❌ Error processing file {json_path}: {e}")
            return 0

    def process_batch(self, input_dir: str, output_dir: str = "output") -> int:
        """
        Batch process all JSON+PNG pairs in a directory

        A JSON file is paired with the PNG of the same base name; JSON files
        without such a PNG are reported and skipped.

        Args:
            input_dir: Directory containing JSON and PNG files
            output_dir: Directory for output files

        Returns:
            Total number of tables processed across all files
        """
        try:
            # Sorted so runs are reproducible (os.listdir order is arbitrary).
            json_files = sorted(f for f in os.listdir(input_dir) if f.endswith('.json'))

            if not json_files:
                print(f"❌ No JSON files found in {input_dir}")
                return 0

            print(f"🗂️ Found {len(json_files)} JSON files to process")

            total_tables = 0
            files_processed = 0

            for json_file in json_files:
                base_name = os.path.splitext(json_file)[0]
                json_path = os.path.join(input_dir, json_file)
                png_path = os.path.join(input_dir, f"{base_name}.png")

                if not os.path.exists(png_path):
                    print(f"⚠️ Warning: No corresponding PNG file found for {json_file}")
                    continue

                print(f"\n📋 Processing file pair: {base_name}")
                tables_count = self.process_single_file(json_path, png_path, output_dir)
                # A file counts as processed only if at least one table succeeded.
                if tables_count > 0:
                    total_tables += tables_count
                    files_processed += 1

            print(f"\n🎉 Batch processing completed!")
            print(f" 📁 Files processed: {files_processed}/{len(json_files)}")
            print(f" 📊 Total tables processed: {total_tables}")

            return total_tables

        except Exception as e:
            print(f"❌ Error in batch processing: {e}")
            return 0
493
+
494
+
495
def main():
    """Print usage examples, then run a batch conversion.

    Optional command-line arguments override the built-in defaults:
    ``crop_tables.py [INPUT_DIR [OUTPUT_DIR]]``.
    """
    import sys

    # Create processor instance
    processor = TableProcessor(padding_ratio=0.02)  # 2% padding

    print("🔧 JSON Table to XML Converter")
    print("=" * 50)

    # Example usage
    print("\n📖 Usage Examples:")
    print("1. Single file (all tables):")
    print(" processor.process_single_file('page1.json', 'page1.png', 'output')")

    print("\n2. Batch processing (all files, all tables):")
    print(" processor.process_batch('input_folder', 'output_folder')")

    print("\n3. Custom padding:")
    print(" processor = TableProcessor(padding_ratio=0.08) # 8% padding")

    # Defaults are the author's original hard-coded locations.
    args = sys.argv[1:]
    input_dir = args[0] if len(args) > 0 else '/Users/tuvn18/Desktop/tuvn18/dev/KIAI/dev/trace/40_page_70_110925'
    output_dir = args[1] if len(args) > 1 else 'output_folder'
    processor.process_batch(input_dir, output_dir)


if __name__ == "__main__":
    main()
delete_name.py ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import os

# Strip the "_cropped" marker from every entry name in ``folder``.
folder = "output_folder"  # replace with the path to your folder

for filename in os.listdir(folder):
    if "_cropped" not in filename:
        continue
    source = os.path.join(folder, filename)
    target = os.path.join(folder, filename.replace("_cropped", ""))
    os.rename(source, target)

print("✅ Đã xoá '_cropped' khỏi tên file.")
json2txt.py ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import json
import cv2
import os

# Draw the bounding box of every table item onto its page image and save the
# result to ``save_folder``. JSON coordinates are doubled before drawing.
page70_folder = "/home/tuvu/Downloads/9_11_2025"
save_folder = "vis_img"
os.makedirs(save_folder, exist_ok=True)
for name in os.listdir(page70_folder):
    if not name.endswith("json"):
        continue

    json_name = os.path.join(page70_folder, name)
    with open(json_name, "r", encoding="utf-8") as f:
        data = json.load(f)

    items = data.get("items", [])
    # splitext keeps dots inside the base name ("a.b.json" -> "a.b.png").
    img_name = os.path.splitext(name)[0] + ".png"
    print(img_name)
    image = cv2.imread(os.path.join(page70_folder, img_name))
    # cv2.imread returns None instead of raising when the file cannot be read.
    if image is None:
        print(f"⚠️ Could not read image {img_name}, skipping")
        continue
    print(image.shape)

    # Filter items where type == "table"
    tables = [item for item in items if item.get("type") == "table"]
    for table in tables:
        x, y = 2 * int(table["x"]), 2 * int(table["y"])
        # cv2.rectangle needs integer points; width/height may be floats.
        w, h = int(2 * table["width"]), int(2 * table["height"])

        top_left = (x, y)
        bottom_right = (x + w, y + h)
        print(top_left, bottom_right)
        cv2.rectangle(image, top_left, bottom_right, (0, 255, 0), 2)  # green box

    # Save the annotated page
    cv2.imwrite(os.path.join(save_folder, img_name), image)
json2xml.py ADDED
@@ -0,0 +1,243 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import xml.etree.ElementTree as ET
3
+ from xml.dom import minidom
4
+
5
def get_visible_cell_coords(table_properties, table_x, table_y):
    """Return the rectangle of every visible cell, scaled by a factor of 2.

    Hidden cells and cells covered by a merge (other than the merge origin)
    are left out. The result maps (row, col) to a dict with the keys
    x, y, width and height.
    """
    n_rows = table_properties.get("rows", 0)
    n_cols = table_properties.get("columns", 0)
    widths = table_properties.get("columnWidths", {})
    heights = table_properties.get("rowHeights", {})
    merges = table_properties.get("mergedCells", {})
    hidden = table_properties.get("hiddenCells", {})

    fallback_width = 100
    fallback_height = 30

    def width_of(c):
        return widths.get(str(c), fallback_width)

    def height_of(r):
        return heights.get(str(r), fallback_height)

    # Every cell swallowed by a merge, i.e. the spanned area minus its top-left origin.
    covered = set()
    for key, info in merges.items():
        origin = tuple(map(int, key.split('-')))
        origin_row, origin_col = origin
        row_stop = origin_row + info.get('rowspan', 1)
        col_stop = origin_col + info.get('colspan', 1)
        covered.update(
            (r, c)
            for r in range(origin_row, row_stop)
            for c in range(origin_col, col_stop)
            if (r, c) != origin
        )

    visible = {}
    for r in range(n_rows):
        for c in range(n_cols):
            key = f"{r}-{c}"
            if hidden.get(key) or (r, c) in covered:
                continue

            # Offset inside the table: everything to the left of / above the cell.
            left = sum(width_of(i) for i in range(c))
            top = sum(height_of(i) for i in range(r))

            # Only merge origins span more than one column/row.
            span_cols, span_rows = 1, 1
            if key in merges:
                span_cols = merges[key].get("colspan", 1)
                span_rows = merges[key].get("rowspan", 1)

            cell_w = sum(width_of(i) for i in range(c, c + span_cols))
            cell_h = sum(height_of(i) for i in range(r, r + span_rows))

            visible[(r, c)] = {
                "x": 2 * (left + table_x),
                "y": 2 * (top + table_y),
                "width": 2 * cell_w,
                "height": 2 * cell_h
            }

    return visible
67
+
68
def get_cell_borders(cell_data, table_properties):
    """Extract border information for a cell.

    Every side starts from the table-wide ``cellBorders["all"]`` flag. A side
    is overridden only when the cell's ``cellStyle`` carries the matching
    ``border<Side>Width`` key: the border is on when the width is > 0, unless
    the matching ``border<Side>Style`` is "none".

    Returns:
        Tuple of (top, bottom, left, right) flags, each 0 or 1.
    """
    # Global table border setting is the default for all four sides
    cell_borders = table_properties.get("cellBorders", {})
    default_flag = 1 if cell_borders.get("all", False) else 0
    borders = {side: default_flag for side in ("top", "bottom", "left", "right")}

    # ``or {}`` also tolerates cell_data=None and an explicit cellStyle: null
    cell_style = (cell_data or {}).get("cellStyle") or {}

    border_mappings = {
        "borderTopWidth": "top",
        "borderBottomWidth": "bottom",
        "borderLeftWidth": "left",
        "borderRightWidth": "right"
    }

    for width_key, border_side in border_mappings.items():
        if width_key not in cell_style:
            continue

        has_border = cell_style[width_key] > 0

        # An explicit style of "none" hides the border regardless of width
        style_key = width_key.replace("Width", "Style")
        if cell_style.get(style_key) == "none":
            has_border = False

        borders[border_side] = 1 if has_border else 0

    return borders["top"], borders["bottom"], borders["left"], borders["right"]
122
+
123
def convert_json_to_xml(json_data, filename="table.jpg"):
    """Convert JSON table data to an XML element tree.

    Args:
        json_data: A JSON string, a single table dict, or a list of items.
            For a list, the first item with ``type == "table"`` is converted;
            if no item is tagged as a table the first item is used.
        filename: Value of the ``filename`` attribute on the root element.

    Returns:
        The ``<document>`` root element holding one ``<table>`` with a
        ``<cell>`` per visible cell.

    Raises:
        IndexError: If ``json_data`` is an empty list.
    """
    # Parse JSON if it's a string
    data = json.loads(json_data) if isinstance(json_data, str) else json_data

    if isinstance(data, list):
        if not data:
            raise IndexError("JSON item list is empty; there is no table to convert")
        # Item lists mix tables with other item types, so look for a real
        # table before falling back to the first entry.
        table_data = next(
            (item for item in data if isinstance(item, dict) and item.get("type") == "table"),
            data[0],
        )
    else:
        table_data = data

    # Extract table information
    properties = table_data.get("properties", {})
    table_x = table_data.get("x", 0)
    table_y = table_data.get("y", 0)
    table_width = table_data.get("width", properties.get("width", 0))
    table_height = table_data.get("height", properties.get("height", 0))

    # Create XML root structure
    root = ET.Element("document", filename=filename)
    table_elem = ET.SubElement(root, "table")

    # Table rectangle as four corner points.
    # NOTE(review): these are unscaled, while the cell rectangles returned by
    # get_visible_cell_coords are doubled -- confirm which scale the XML
    # consumer expects.
    x1, y1 = int(table_x), int(table_y)
    x2, y2 = int(table_x + table_width), int(table_y + table_height)
    table_coords = f"{x1},{y1} {x2},{y1} {x2},{y2} {x1},{y2}"
    ET.SubElement(table_elem, "Coords", points=table_coords)

    # Get cell coordinates and data
    cell_coords = get_visible_cell_coords(properties, table_x, table_y)
    cell_data = properties.get("cellData", {})
    merged_cells = properties.get("mergedCells", {})

    # Create XML elements for each visible cell
    for (row, col), coords in cell_coords.items():
        cell_key = f"{row}-{col}"
        current_cell_data = cell_data.get(cell_key, {})

        # Inclusive end indices; only merge origins span more than one cell
        merge_info = merged_cells.get(cell_key, {})
        end_row = row + merge_info.get("rowspan", 1) - 1
        end_col = col + merge_info.get("colspan", 1) - 1

        cell_elem = ET.SubElement(table_elem, "cell")
        cell_elem.set("start-row", str(row))
        cell_elem.set("end-row", str(end_row))
        cell_elem.set("start-col", str(col))
        cell_elem.set("end-col", str(end_col))

        # Cell rectangle as four corner points
        x1 = int(coords["x"])
        y1 = int(coords["y"])
        x2 = int(coords["x"] + coords["width"])
        y2 = int(coords["y"] + coords["height"])
        cell_coord_str = f"{x1},{y1} {x2},{y1} {x2},{y2} {x1},{y2}"
        ET.SubElement(cell_elem, "Coords", points=cell_coord_str)

        # Border flags for the four sides
        top, bottom, left, right = get_cell_borders(current_cell_data, properties)
        ET.SubElement(cell_elem, "Lines",
                      top=str(top),
                      bottom=str(bottom),
                      left=str(left),
                      right=str(right))

    return root
198
+
199
def save_xml_to_file(xml_root, output_path):
    """Write ``xml_root`` to ``output_path`` as pretty-printed UTF-8 XML.

    The tree is re-parsed with minidom for indentation, and lines that
    contain only whitespace are removed before writing.
    """
    document = minidom.parseString(ET.tostring(xml_root, encoding='unicode'))

    kept_lines = []
    for line in document.toprettyxml(indent=" ").split('\n'):
        if line.strip():
            kept_lines.append(line)

    with open(output_path, 'w', encoding='utf-8') as f:
        f.write('\n'.join(kept_lines))
213
+
214
def convert_json_file_to_xml(json_file_path, xml_file_path, filename="table.jpg"):
    """Convert a JSON file to an XML file.

    The JSON may be an object with an "items" list, a bare list of items, or
    a single table object.

    Args:
        json_file_path: Path of the JSON file to read.
        xml_file_path: Path of the XML file to write.
        filename: Image filename recorded in the XML root element.

    Returns:
        True on success, False if anything failed (the error is printed).
    """
    try:
        # Read JSON file
        with open(json_file_path, 'r', encoding='utf-8') as f:
            json_data = json.load(f)

        # Unwrap the {"items": [...]} envelope only when it is present; a
        # top-level list or a bare table object is passed through unchanged.
        if isinstance(json_data, dict) and "items" in json_data:
            json_data = json_data["items"]

        # Convert to XML
        xml_root = convert_json_to_xml(json_data, filename)

        # Save XML file
        save_xml_to_file(xml_root, xml_file_path)

        print(f"✅ Successfully converted {json_file_path} to {xml_file_path}")
        return True

    except Exception as e:
        print(f"❌ Error converting file: {e}")
        return False
233
+
234
# Script entry point: convert every JSON file in ``folder``; the XML files
# are written to the current working directory.
if __name__ == "__main__":

    import os

    folder = "/Users/tuvn18/Desktop/tuvn18/dev/KIAI/dev/trace/test_json"
    json_names = [entry for entry in os.listdir(folder) if entry.endswith('json')]
    for json_name in json_names:
        xml_name = json_name.replace('.json', '.xml')
        image_name = xml_name.replace('.xml', '.png')
        convert_json_file_to_xml(os.path.join(folder, json_name), xml_name, image_name)
rename.py ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
import os

# Shorten the exported file names in ``folder_json``:
#   PNG  -> text before the first '-'        + ".png"
#   JSON -> text between first and second '-' + ".json"
folder_json = "/Users/tuvn18/Desktop/tuvn18/dev/KIAI/dev/trace/40_page_70_110925"

for name in os.listdir(folder_json):
    if name.endswith('.png'):
        extension, segment_index = '.png', 0
    elif name.endswith('.json'):
        extension, segment_index = '.json', 1
    else:
        continue

    segments = name.split('-')
    # Names without a '-' are already in short form; renaming them would
    # double the extension (PNG) or raise IndexError (JSON).
    if len(segments) < 2:
        continue

    stem = segments[segment_index]
    # The chosen segment may already carry the extension ("a-b.json").
    if stem.endswith(extension):
        stem = stem[:-len(extension)]
    new_name = stem + extension

    source = os.path.join(folder_json, name)
    target = os.path.join(folder_json, new_name)
    # os.rename can silently replace an existing file; never clobber one.
    if os.path.exists(target):
        print(f"⚠️ Skipping {name}: {new_name} already exists")
        continue
    os.rename(source, target)
resize.py ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
from PIL import Image
import os

# Input and output folders
input_folder = "/Users/tuvn18/Desktop/tuvn18/dev/KIAI/dev/trace/page_39"
output_folder = "/Users/tuvn18/Desktop/tuvn18/dev/KIAI/dev/trace/train_table_1209"

os.makedirs(output_folder, exist_ok=True)

# Write a half-size copy of every image in the input folder
for filename in os.listdir(input_folder):
    # Skip non-image files
    if not filename.lower().endswith((".png", ".jpg", ".jpeg", ".bmp", ".gif")):
        continue

    file_path = os.path.join(input_folder, filename)

    with Image.open(file_path) as img:
        w, h = img.size
        # Keep each side at least 1 px: a zero-sized target makes resize() fail.
        new_size = (max(1, w // 2), max(1, h // 2))
        resized = img.resize(new_size, Image.LANCZOS)

        # Save to output folder under the same file name
        resized.save(os.path.join(output_folder, filename))

print("✅ Done! All images resized to half.")
vis_json_cell.py ADDED
@@ -0,0 +1,103 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import cv2
3
+ import os
4
+
5
def get_visible_cell_coords(table_properties, table_x, table_y):
    """Return the integer pixel rectangle of every visible table cell.

    Hidden cells and cells covered by a merge (other than the merge origin)
    are left out. The result maps (row, col) to a dict with the keys
    x, y, width and height; no scaling is applied.
    """
    n_rows = table_properties.get("rows", 0)
    n_cols = table_properties.get("columns", 0)
    widths = table_properties.get("columnWidths", {})
    heights = table_properties.get("rowHeights", {})
    merges = table_properties.get("mergedCells", {})
    hidden = table_properties.get("hiddenCells", {})

    fallback_width = 100
    fallback_height = 30

    def width_of(c):
        return widths.get(str(c), fallback_width)

    def height_of(r):
        return heights.get(str(r), fallback_height)

    # Every cell swallowed by a merge, i.e. the spanned area minus its top-left origin.
    covered = set()
    for key, info in merges.items():
        origin = tuple(map(int, key.split('-')))
        origin_row, origin_col = origin
        row_stop = origin_row + info.get('rowspan', 1)
        col_stop = origin_col + info.get('colspan', 1)
        covered.update(
            (r, c)
            for r in range(origin_row, row_stop)
            for c in range(origin_col, col_stop)
            if (r, c) != origin
        )

    visible = {}
    for r in range(n_rows):
        for c in range(n_cols):
            key = f"{r}-{c}"
            if hidden.get(key) or (r, c) in covered:
                continue

            # Offset inside the table: everything to the left of / above the cell.
            left = sum(width_of(i) for i in range(c))
            top = sum(height_of(i) for i in range(r))

            # Only merge origins span more than one column/row.
            span_cols, span_rows = 1, 1
            if key in merges:
                span_cols = merges[key].get("colspan", 1)
                span_rows = merges[key].get("rowspan", 1)

            cell_w = sum(width_of(i) for i in range(c, c + span_cols))
            cell_h = sum(height_of(i) for i in range(r, r + span_rows))

            visible[(r, c)] = {
                "x": int(left + table_x),
                "y": int(top + table_y),
                "width": int(cell_w),
                "height": int(cell_h)
            }

    return visible
66
+
67
# Draw every visible table cell of each page JSON onto its page image and
# save the annotated image to ``save_folder``.
folder_path = "/Users/tuvn18/Desktop/tuvn18/dev/KIAI/dev/trace/train_table_1209"
save_folder = "cell_vis"
os.makedirs(save_folder, exist_ok=True)
for name in os.listdir(folder_path):
    if not name.endswith("json"):
        continue

    json_file = os.path.join(folder_path, name)
    with open(json_file, "r", encoding="utf-8") as f:
        data = json.load(f)

    # splitext keeps dots inside the base name ("a.b.json" -> "a.b.png").
    img_name = os.path.splitext(name)[0] + ".png"
    print(img_name)
    image = cv2.imread(os.path.join(folder_path, img_name))
    # cv2.imread returns None instead of raising when the file cannot be read.
    if image is None:
        print(f"⚠️ Could not read image {img_name}, skipping")
        continue

    # A missing or null "items" entry means there is nothing to draw.
    for table in data.get('items') or []:
        if table.get('type') != 'table':
            continue

        table_x = table.get('x', 0)
        table_y = table.get('y', 0)
        table_prob = table.get('properties') or {}
        boxes = get_visible_cell_coords(table_prob, table_x, table_y)

        for box in boxes.values():
            x, y, w, h = int(box["x"]), int(box["y"]), int(box["width"]), int(box["height"])

            top_left = (x, y)
            bottom_right = (x + w, y + h)

            cv2.rectangle(image, top_left, bottom_right, (0, 255, 0), 2)

    # Save the annotated page
    cv2.imwrite(os.path.join(save_folder, img_name), image)