kiaisoft committed on
Commit
7804d7a
·
verified ·
1 Parent(s): 66180d7

Upload xml2json.py

Browse files
Files changed (1) hide show
  1. xml2json.py +168 -0
xml2json.py ADDED
@@ -0,0 +1,168 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import xml.etree.ElementTree as ET
2
+ import json
3
+ import os
4
+ from typing import List, Dict, Any
5
+
6
def parse_coords(coords_str: str) -> List[List[float]]:
    """
    Convert a coordinates string "x1,y1 x2,y2 x3,y3 x4,y4" to LabelMe polygon format.

    Args:
        coords_str: Whitespace-separated "x,y" pairs.

    Returns:
        A list of [x, y] float pairs, in input order. An empty or
        whitespace-only string yields an empty list.

    Raises:
        ValueError: If any pair is not exactly two comma-separated numbers.
    """
    points = []

    # str.split() with no argument already ignores leading/trailing whitespace.
    for pair in coords_str.split():
        x_text, _, y_text = pair.partition(',')
        try:
            points.append([float(x_text), float(y_text)])
        except ValueError:
            # Covers a missing comma (y_text == ""), extra commas
            # (y_text == "2,3") and non-numeric text; name the bad token so
            # the caller can locate it in the XML.
            raise ValueError(
                f"Malformed coordinate pair {pair!r}; expected 'x,y'"
            ) from None

    return points
18
+
19
def _polygon_shape(label: str, points: List[List[float]], table_idx: int,
                   description: str) -> Dict[str, Any]:
    """Build one LabelMe polygon shape entry grouped under its table."""
    return {
        "label": label,
        "points": points,
        "group_id": f"table_{table_idx}",  # Same group ID for a table and its cells
        "shape_type": "polygon",
        "flags": {},
        "description": description,
    }


def _coords_points(element):
    """Return parsed points from element's <Coords points="..."> child, or None if absent/empty."""
    coords = element.find('Coords')
    if coords is None:
        return None
    points_str = coords.get('points')
    if not points_str:
        return None
    return parse_coords(points_str)


def xml_to_labelme(xml_file_path: str, output_dir: str = None) -> Dict[str, Any]:
    """
    Convert XML table annotation to LabelMe JSON format

    Every direct <table> child of the XML root that has a <Coords points="...">
    child becomes a "table" polygon, and every <cell> child of a table that has
    one becomes a "cell" polygon. A table and its cells share the group id
    "table_<index>".

    Args:
        xml_file_path: Path to input XML file
        output_dir: Output directory for JSON file (optional). When given, the
            directory is created if needed, "<xml base name>.json" is written
            into it and a short summary is printed.

    Returns:
        Dictionary containing LabelMe format data

    Raises:
        ValueError: If the file is not well-formed XML (the original
            ParseError is chained as the cause) or a points attribute is
            rejected by parse_coords.
    """

    # Parse XML
    try:
        root = ET.parse(xml_file_path).getroot()
    except ET.ParseError as e:
        # Chain the cause so the parser's line/column details stay in the traceback.
        raise ValueError(f"Invalid XML format: {e}") from e

    # Initialize LabelMe structure
    shapes = []
    labelme_data = {
        "version": "5.0.1",
        "flags": {},
        "shapes": shapes,
        # Image filename comes from the root's "filename" attribute
        "imagePath": root.get('filename', 'image.jpg'),
        "imageData": None,
        "imageHeight": 0,  # Replaced below by an estimate when any point exists
        "imageWidth": 0    # Replaced below by an estimate when any point exists
    }

    table_count = 0
    cell_count = 0

    # Process all table elements (can be multiple tables in one XML)
    for table_idx, table in enumerate(root.findall('table')):
        table_points = _coords_points(table)
        if table_points is not None:
            shapes.append(_polygon_shape(
                "table", table_points, table_idx, f"Table {table_idx + 1}"
            ))
            table_count += 1

        # Cells are processed even when the table itself has no usable Coords.
        for cell in table.findall('cell'):
            cell_points = _coords_points(cell)
            if cell_points is None:
                continue

            # Row/column span attributes are only used for the description text
            start_row = cell.get('start-row', '0')
            end_row = cell.get('end-row', '0')
            start_col = cell.get('start-col', '0')
            end_col = cell.get('end-col', '0')

            shapes.append(_polygon_shape(
                "cell", cell_points, table_idx,
                f"Table {table_idx + 1} - Row:{start_row}-{end_row}, Col:{start_col}-{end_col}"
            ))
            cell_count += 1

    # The image size is not read from the XML or the image: it is estimated as
    # the largest coordinate seen plus 50 pixels of padding.
    if any(shape["points"] for shape in shapes):
        labelme_data["imageWidth"] = int(max(
            point[0] for shape in shapes for point in shape["points"]
        )) + 50
        labelme_data["imageHeight"] = int(max(
            point[1] for shape in shapes for point in shape["points"]
        )) + 50

    # Save to JSON file
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

        # Output file is named after the XML file, with a .json extension
        base_name = os.path.splitext(os.path.basename(xml_file_path))[0]
        json_path = os.path.join(output_dir, f"{base_name}.json")

        with open(json_path, 'w', encoding='utf-8') as f:
            json.dump(labelme_data, f, indent=2, ensure_ascii=False)

        print(f"Converted successfully! Output saved to: {json_path}")
        print(f"Found {len(shapes)} shapes total:")
        print(f" - Tables: {table_count}")
        print(f" - Cells: {cell_count}")
        if table_count > 0:
            print(f" - Average cells per table: {cell_count / table_count:.1f}")

    return labelme_data
134
+
135
def batch_convert(input_dir: str, output_dir: str):
    """
    Convert all XML files in a directory to LabelMe JSON format

    Files are processed in sorted name order. A failure on one file is
    reported on stdout and does not stop the remaining conversions.

    Args:
        input_dir: Directory containing XML files (names ending in ".xml")
        output_dir: Directory to save JSON files

    Raises:
        ValueError: If input_dir does not exist or is not a directory.
    """

    if not os.path.exists(input_dir):
        raise ValueError(f"Input directory does not exist: {input_dir}")
    if not os.path.isdir(input_dir):
        # A regular file passes the exists() check but would make
        # os.listdir() fail with NotADirectoryError; report it explicitly.
        raise ValueError(f"Input path is not a directory: {input_dir}")

    # os.listdir() order is arbitrary; sort for reproducible processing and logs.
    xml_files = sorted(f for f in os.listdir(input_dir) if f.endswith('.xml'))

    if not xml_files:
        print(f"No XML files found in {input_dir}")
        return

    print(f"Found {len(xml_files)} XML files to convert...")

    success_count = 0
    for xml_file in xml_files:
        xml_path = os.path.join(input_dir, xml_file)
        try:
            xml_to_labelme(xml_path, output_dir)
        except Exception as e:
            # Deliberately broad: this is the batch boundary, and one bad file
            # must not abort the rest of the run.
            print(f"Error converting {xml_file}: {e}")
        else:
            success_count += 1

    print(f"\nConversion completed! Successfully converted {success_count}/{len(xml_files)} files.")
165
+
166
# Example usage:
#   python xml2json.py [input_dir [output_dir]]
# With no arguments the original example directory is used for both input and
# output; with only input_dir, JSON files are written next to the XML files.
if __name__ == "__main__":
    import sys

    default_dir = '/Users/tuvn18/Desktop/tuvn18/dev/KIAI/dev/trace/src/train_trace_page39'
    input_dir = sys.argv[1] if len(sys.argv) > 1 else default_dir
    output_dir = sys.argv[2] if len(sys.argv) > 2 else input_dir
    batch_convert(input_dir, output_dir)