Suhasdev committed on
Commit
ba6e49b
·
0 Parent(s):

Refactor XML Cleaner with dependency injection and MinHash-based similarity matching

Browse files
Files changed (7) hide show
  1. .gitattributes +35 -0
  2. .gitignore +15 -0
  3. README.md +40 -0
  4. app.py +241 -0
  5. core_cleaner.py +514 -0
  6. ocr_strategies.py +54 -0
  7. visualizer.py +188 -0
.gitattributes ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ __pycache__/
2
+ *.py[cod]
3
+ *$py.class
4
+ *.so
5
+ .Python
6
+ venv/
7
+ env/
8
+ ENV/
9
+ .venv
10
+ *.egg-info/
11
+ dist/
12
+ build/
13
+ .DS_Store
14
+ *.log
15
+
README.md ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: Intelligent XML Cleaner
3
+ emoji: 🌳
4
+ colorFrom: blue
5
+ colorTo: green
6
+ sdk: gradio
7
+ sdk_version: 6.1.0
8
+ app_file: app.py
9
+ pinned: false
10
+ ---
11
+ # Intelligent XML Cleaner & Visualizer
12
+
13
+ This tool helps Android developers and QA engineers clean stale accessibility node information from UI XML dumps.
14
+
15
+ ## Features
16
+
17
+ * **Active-Based Sibling Pruning:** Intelligently removes XML nodes that are not visible on the screen based on OCR analysis or manual text input.
18
+ * **Flexible Text Input:** Optionally provide visible text manually, or use OCR for automatic extraction.
19
+ * **Dual OCR Strategy:** Choose between **EasyOCR** (Deep Learning based, high accuracy) or **Tesseract** (Fast, standard) as fallback when manual text is not provided.
20
+ * **Comprehensive Visualization:**
21
+ * **Tree View:** See the hierarchical structure of your XML before and after cleaning.
22
+ * **Screen View:** Visual confirmation of bounding boxes overlaid on the original screenshot.
23
+
24
+ ## How to use
25
+
26
+ 1. Upload the Screenshot of the app state.
27
+ 2. Upload the corresponding XML dump (from `uiautomator`).
28
+ 3. **(Optional)** Enter visible text from the screenshot manually (one per line or comma-separated). If left empty, OCR will be used automatically.
29
+ 4. Select your preferred OCR engine (only used if visible text is not provided).
30
+ 5. Click **Process**.
31
+ 6. View the comparisons in the tabs and download the cleaned XML.
32
+
33
+ ## Technical Details
34
+
35
+ This application uses a sophisticated pipeline:
36
+
37
+ 1. **Text Extraction:** Uses provided visible text (if available) or extracts visible text from the image using OCR.
38
+ 2. **LCA Calculation:** Finds the Lowest Common Ancestor of all active elements.
39
+ 3. **Pruning:** Traverses upward from the Active LCA and prunes sibling subtrees that contain stale text elements (text that does not match any visible text).
40
+
app.py ADDED
@@ -0,0 +1,241 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import os
3
+ import tempfile
4
+ import shutil
5
+ from pathlib import Path
6
+ import time
7
+
8
+ # Import our modules
9
+ from ocr_strategies import OCRFactory
10
+ from core_cleaner import XMLCleanerCore
11
+ from visualizer import XMLTreeVisualizer, BoundingBoxVisualizer
12
+
13
+ # Initialize Logic Classes
14
+ cleaner_core = XMLCleanerCore()
15
+ tree_viz = XMLTreeVisualizer()
16
+ bbox_viz = BoundingBoxVisualizer()
17
+
18
+ def process_pipeline(image_file, xml_file, ocr_choice, visible_text_input, progress=gr.Progress()):
19
+ # 1. Validation
20
+ if xml_file is None:
21
+ raise gr.Error("Please upload XML file.")
22
+
23
+ # Check if we need image (only if visible text is not provided)
24
+ use_ocr = not (visible_text_input and visible_text_input.strip())
25
+ if use_ocr and image_file is None:
26
+ raise gr.Error("Please upload Image file when using OCR, or provide visible text manually.")
27
+
28
+ start_time = time.time()
29
+
30
+ # 2. Setup Paths (Safe Temp Files)
31
+ temp_dir = Path(tempfile.gettempdir())
32
+ unique_id = str(int(time.time()))
33
+
34
+ # Paths for outputs
35
+ cleaned_xml_path = temp_dir / f"cleaned_{unique_id}.xml"
36
+
37
+ img_viz_before = temp_dir / f"bbox_before_{unique_id}.png"
38
+ img_viz_after = temp_dir / f"bbox_after_{unique_id}.png"
39
+ tree_viz_before = temp_dir / f"tree_before_{unique_id}.png"
40
+ tree_viz_after = temp_dir / f"tree_after_{unique_id}.png"
41
+
42
+ # 3. Text Extraction Stage (OCR or Manual Input)
43
+ text_source = None
44
+ if visible_text_input and visible_text_input.strip():
45
+ # Use provided visible text - NO OCR NEEDED
46
+ progress(0.2, desc="Using provided visible text (OCR skipped)...")
47
+ # Convert input text to set of strings (split by newlines or commas)
48
+ lines = [line.strip() for line in visible_text_input.replace(',', '\n').split('\n') if line.strip()]
49
+ visible_text = {line.lower().strip() for line in lines if line.strip()}
50
+ text_source = "Manual Input"
51
+ else:
52
+ # Use OCR - image is required here
53
+ progress(0.2, desc="Running OCR on image...")
54
+ ocr_engine = OCRFactory.get_strategy(ocr_choice)
55
+ visible_text = ocr_engine.extract_text(image_file)
56
+ text_source = ocr_choice
57
+
58
+ # 4. XML Parsing & Detection
59
+ progress(0.4, desc="Parsing XML...")
60
+ tree, root, parent_map = cleaner_core.parse_xml(xml_file)
61
+
62
+ progress(0.5, desc="Detecting Stale Elements...")
63
+ active, stale = cleaner_core.find_active_and_stale(root, visible_text)
64
+
65
+ # 5. Pruning
66
+ progress(0.6, desc="Pruning Tree...")
67
+ removed_count = 0
68
+ if stale:
69
+ removed_count = cleaner_core.prune_stale_subtrees(root, active, stale, parent_map)
70
+
71
+ # Save Cleaned XML
72
+ tree.write(str(cleaned_xml_path))
73
+
74
+ # 6. Visualization Generation
75
+ progress(0.7, desc="Generating Visualizations...")
76
+
77
+ # Bounding Boxes (only if image is provided)
78
+ if image_file is not None:
79
+ bbox_viz.visualize(image_file, xml_file, str(img_viz_before))
80
+ bbox_viz.visualize(image_file, str(cleaned_xml_path), str(img_viz_after))
81
+ else:
82
+ # Create placeholder images or skip
83
+ img_viz_before = None
84
+ img_viz_after = None
85
+
86
+ # Trees
87
+ progress(0.8, desc="Drawing Trees (This might take a moment)...")
88
+ # Before: no highlights
89
+ tree_viz.visualize(xml_file, str(tree_viz_before), visible_text=None, active_elements=None)
90
+ # After: highlight active elements (OCR matched nodes)
91
+ active_elements_set = set(active) if active else set()
92
+ tree_viz.visualize(str(cleaned_xml_path), str(tree_viz_after), visible_text, active_elements_set)
93
+
94
+ # 7. Stats
95
+ total_time = time.time() - start_time
96
+ stats_md = f"""
97
+ ### 📊 Process Statistics
98
+
99
+ | Metric | Result |
100
+ | :--- | :--- |
101
+ | **Text Source** | {text_source} |
102
+ | **Elements Removed** | `{removed_count}` |
103
+ | **Active Elements** | `{len(active)}` |
104
+ | **Stale Elements** | `{len(stale)}` |
105
+ | **Processing Time** | `{total_time:.2f}s` |
106
+ """
107
+
108
+ ocr_text_display = "\n".join(sorted(list(visible_text)))
109
+
110
+ progress(1.0, desc="Done!")
111
+
112
+ return (
113
+ str(tree_viz_before),
114
+ str(tree_viz_after),
115
+ str(img_viz_before) if img_viz_before else None,
116
+ str(img_viz_after) if img_viz_after else None,
117
+ stats_md,
118
+ ocr_text_display,
119
+ str(cleaned_xml_path)
120
+ )
121
+
122
+ # --- Gradio UI Layout ---
123
+ custom_css = """
124
+ .container { max-width: 1100px; margin: auto; }
125
+ .header { text-align: center; margin-bottom: 20px; }
126
+ .stat-box { border: 1px solid #ddd; padding: 10px; border-radius: 8px; background: #f9f9f9; }
127
+ """
128
+
129
+ with gr.Blocks() as app:
130
+
131
+ with gr.Row():
132
+ gr.Markdown(
133
+ """
134
+ # 🌳 XML Cleaner & Visualizer Studio
135
+ **Optimize Mobile UI XMLs** by removing invisible/stale nodes using OCR-based or manual text input for sibling pruning.
136
+ """,
137
+ elem_classes="header"
138
+ )
139
+
140
+ with gr.Row():
141
+ # --- Left Panel: Inputs ---
142
+ with gr.Column(scale=1, variant="panel"):
143
+ gr.Markdown("### 1. Upload Data")
144
+ img_input = gr.Image(type="filepath", label="Screenshot (PNG/JPG)")
145
+ gr.Markdown("*Optional if visible text is provided below*")
146
+ xml_input = gr.File(label="XML Layout Dump", file_types=[".xml"])
147
+
148
+ gr.Markdown("### 2. Visible Text (Optional)")
149
+ visible_text_input = gr.TextArea(
150
+ label="Visible Text",
151
+ placeholder="Enter visible text from the screenshot (one per line or comma-separated). Leave empty to use OCR.",
152
+ lines=5,
153
+ info="If provided, this text will be used instead of OCR. Otherwise, OCR will be used automatically."
154
+ )
155
+
156
+ # Status indicator for text input mode
157
+ text_input_status = gr.Markdown("", visible=False)
158
+
159
+ gr.Markdown("### 3. Settings")
160
+ ocr_selector = gr.Dropdown(
161
+ choices=["EasyOCR (Best Accuracy)", "Tesseract (Fast & Free)"],
162
+ value="EasyOCR (Best Accuracy)",
163
+ label="OCR Engine (Fallback)",
164
+ info="Used only if visible text is not provided above.",
165
+ interactive=True
166
+ )
167
+
168
+ btn_run = gr.Button("✨ Run Analysis & Clean", variant="primary", size="lg")
169
+
170
+ # --- Right Panel: Outputs ---
171
+ with gr.Column(scale=2):
172
+ gr.Markdown("### 4. Analysis Results")
173
+
174
+ # Stats Area
175
+ stats_output = gr.Markdown()
176
+
177
+ # Visualization Tabs
178
+ with gr.Tabs():
179
+ with gr.TabItem("🌳 Tree Structure"):
180
+ gr.Markdown("*Left: Original XML | Right: Cleaned XML (Active Nodes Highlighted)*")
181
+ with gr.Row():
182
+ out_tree_before = gr.Image(label="Before Pruning", type="filepath")
183
+ out_tree_after = gr.Image(label="After Pruning", type="filepath")
184
+
185
+ with gr.TabItem("🖼️ Bounding Boxes"):
186
+ gr.Markdown("*Visualizing XML bounds on the screenshot*")
187
+ with gr.Row():
188
+ out_bbox_before = gr.Image(label="Original Bounds", type="filepath")
189
+ out_bbox_after = gr.Image(label="Cleaned Bounds", type="filepath")
190
+
191
+ with gr.TabItem("📝 OCR Data"):
192
+ out_ocr_text = gr.TextArea(label="Detected Text", lines=10, interactive=False)
193
+
194
+ # Download
195
+ gr.Markdown("### 5. Export")
196
+ out_file = gr.File(label="Download Cleaned XML")
197
+
198
+ # Function to toggle OCR selector and image input based on visible text input
199
+ def toggle_ocr_selector(visible_text):
200
+ """Disable OCR selector if visible text is provided, enable if empty"""
201
+ if visible_text and visible_text.strip():
202
+ return (
203
+ gr.update(
204
+ label="OCR Engine (Disabled - Using Manual Text)",
205
+ info="⚠️ OCR is disabled because visible text is provided above.",
206
+ interactive=False
207
+ ),
208
+ gr.update(value="✅ **Using Manual Text Input** - OCR is disabled. Image is optional.", visible=True),
209
+ gr.update(label="Screenshot (PNG/JPG) - Optional")
210
+ )
211
+ else:
212
+ return (
213
+ gr.update(
214
+ label="OCR Engine",
215
+ info="Select OCR engine to extract visible text from the screenshot.",
216
+ interactive=True
217
+ ),
218
+ gr.update(value="", visible=False),
219
+ gr.update(label="Screenshot (PNG/JPG) - Required")
220
+ )
221
+
222
+ # Wire Interactions
223
+ # Update OCR selector and image input when visible text changes
224
+ visible_text_input.change(
225
+ fn=toggle_ocr_selector,
226
+ inputs=[visible_text_input],
227
+ outputs=[ocr_selector, text_input_status, img_input]
228
+ )
229
+
230
+ btn_run.click(
231
+ fn=process_pipeline,
232
+ inputs=[img_input, xml_input, ocr_selector, visible_text_input],
233
+ outputs=[
234
+ out_tree_before, out_tree_after,
235
+ out_bbox_before, out_bbox_after,
236
+ stats_output, out_ocr_text, out_file
237
+ ]
238
+ )
239
+
240
+ if __name__ == "__main__":
241
+ app.launch(css=custom_css, theme=gr.themes.Soft())
core_cleaner.py ADDED
@@ -0,0 +1,514 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Refactored XML Cleaner with dependency injection.
3
+ """
4
+
5
+ import time
6
+ from pathlib import Path
7
+ from typing import Dict, List, Tuple, Set, Optional
8
+ from logging import getLogger
9
+ import xml.etree.ElementTree as ET
10
+ import asyncio
11
+ import re
12
+ from datasketch import MinHash
13
+
14
+ from ocr_strategies import OCRStrategy
15
+ from functools import wraps
16
+
17
+ logger = getLogger(__name__)
18
+
19
+
20
+ # Simple decorators for compatibility (can be enhanced later)
21
def profile_it(func_name: str = "", tags: Optional[dict] = None):
    """Placeholder profiling decorator that currently just calls through.

    Works on both coroutine functions and plain functions: the wrapper is
    async only when the decorated function is, so a synchronous function is
    no longer turned into a coroutine that fails on ``await``.

    Args:
        func_name: Label for the profiled function (currently unused).
        tags: Optional metadata tags (currently unused).

    Returns:
        A decorator that preserves the wrapped function's metadata.
    """
    import inspect  # local import: keeps the block self-contained

    def decorator(func):
        if inspect.iscoroutinefunction(func):
            @wraps(func)
            async def async_wrapper(*args, **kwargs):
                return await func(*args, **kwargs)
            return async_wrapper

        # Plain callable: pass the result straight through. If it happens to
        # return an awaitable, the caller can still await it as before.
        @wraps(func)
        def sync_wrapper(*args, **kwargs):
            return func(*args, **kwargs)
        return sync_wrapper

    return decorator
29
+
30
+
31
def json_log():
    """Placeholder logging decorator that currently just calls through.

    Works on both coroutine functions and plain functions: the wrapper is
    async only when the decorated function is, so a synchronous function is
    no longer turned into a coroutine that fails on ``await``.

    Returns:
        A decorator that preserves the wrapped function's metadata.
    """
    import inspect  # local import: keeps the block self-contained

    def decorator(func):
        if inspect.iscoroutinefunction(func):
            @wraps(func)
            async def async_wrapper(*args, **kwargs):
                return await func(*args, **kwargs)
            return async_wrapper

        # Plain callable: pass the result straight through.
        @wraps(func)
        def sync_wrapper(*args, **kwargs):
            return func(*args, **kwargs)
        return sync_wrapper

    return decorator
39
+
40
+
41
+ # Adapter to make OCRStrategy async-compatible
42
class BaseOCR:
    """Async facade over a synchronous ``OCRStrategy``."""

    def __init__(self, ocr_strategy: OCRStrategy):
        # The wrapped strategy does the actual (blocking) OCR work.
        self._strategy = ocr_strategy

    async def extract_visible_text(self, image_path: Path) -> Set[str]:
        """Run the strategy's ``extract_text`` in a worker thread.

        The path is converted to ``str`` before being handed to the strategy.
        """
        path_text = str(image_path)
        return await asyncio.to_thread(self._strategy.extract_text, path_text)
53
+
54
+
55
+ class DataLoader:
56
+ """Loads and validates image and XML inputs asynchronously"""
57
+
58
+ async def load_inputs(self, image_path: str, xml_path: str) -> Tuple[Path, ET.ElementTree]:
59
+ img_path = Path(image_path)
60
+ xml_file = Path(xml_path)
61
+
62
+ if not img_path.exists():
63
+ raise FileNotFoundError(f"Image not found: {image_path}")
64
+ if not xml_file.exists():
65
+ raise FileNotFoundError(f"XML not found: {xml_path}")
66
+
67
+ def _parse_xml():
68
+ return ET.parse(xml_file)
69
+
70
+ tree = await asyncio.to_thread(_parse_xml)
71
+ return img_path, tree
72
+
73
+
74
+ class XMLParser:
75
+ """Parses XML and builds parent-child relationships"""
76
+
77
+ def parse_xml(self, tree: ET.ElementTree) -> Tuple[ET.Element, Dict[ET.Element, ET.Element]]:
78
+ root = tree.getroot()
79
+ parent_map = {child: parent for parent in root.iter() for child in parent}
80
+ return root, parent_map
81
+
82
+
83
+ class StaleDetector:
84
+ """
85
+ Identifies active and stale elements by comparing XML with OCR results.
86
+
87
+ Logic:
88
+ - Active elements: XML elements whose text matches OCR results (visible on screen)
89
+ - Stale elements: XML elements whose text does NOT match OCR results (not visible)
90
+ """
91
+
92
+ def find_active_and_stale(
93
+ self,
94
+ root: ET.Element,
95
+ visible_text: Set[str]
96
+ ) -> Tuple[List[ET.Element], List[ET.Element]]:
97
+ """
98
+ Compare all XML elements with OCR results to classify as active or stale.
99
+
100
+ Args:
101
+ root: Root element of the XML tree to process
102
+ visible_text: Set of text strings extracted from OCR (visible on screen)
103
+
104
+ Returns:
105
+ Tuple containing:
106
+ - List of active elements (text matches OCR results)
107
+ - List of stale elements (text does not match OCR results)
108
+ """
109
+ active_elements = []
110
+ stale_elements = []
111
+ filtered_ocr = {t for t in visible_text if len(t) > 2}
112
+
113
+ for elem in root.iter():
114
+ text = elem.get('text', '').lower().strip()
115
+
116
+ if text and len(text) > 2:
117
+
118
+ if self._is_similar(text, filtered_ocr):
119
+ active_elements.append(elem)
120
+ else:
121
+ stale_elements.append(elem)
122
+
123
+ return active_elements, stale_elements
124
+
125
+ def _is_similar(self, elem_text: str, visible_text: Set[str]) -> bool:
126
+ """
127
+ Check if text matches any visible text using MinHash-based Jaccard similarity.
128
+
129
+ Uses MinHash algorithm for efficient similarity comparison:
130
+ 1. Tokenizes element text and OCR text into word tokens
131
+ 2. Creates MinHash signatures for both texts
132
+ 3. Calculates Jaccard similarity between MinHashes
133
+ 4. Returns True if similarity >= 50%
134
+
135
+ MinHash provides better performance and accuracy for text similarity
136
+ compared to simple token overlap, especially for longer texts.
137
+
138
+ Args:
139
+ elem_text: Text from XML element
140
+ visible_text: Set of text strings from OCR results
141
+
142
+ Returns:
143
+ True if element text semantically matches any OCR text, False otherwise
144
+ """
145
+ # Tokenize element text into word tokens
146
+ elem_tokens = re.findall(r'\w+', elem_text.lower())
147
+
148
+ # Create MinHash for element text
149
+ if not elem_tokens:
150
+ return False
151
+
152
+ elem_minhash = MinHash(num_perm=32)
153
+ for token in elem_tokens:
154
+ elem_minhash.update(token.encode())
155
+
156
+ # Compare with each OCR text result using MinHash
157
+ for ocr_text in visible_text:
158
+ # Tokenize OCR text into word tokens
159
+ ocr_tokens = re.findall(r'\w+', ocr_text.lower())
160
+
161
+ if not ocr_tokens:
162
+ continue
163
+
164
+ # Create MinHash for OCR text
165
+ ocr_minhash = MinHash(num_perm=32)
166
+ for token in ocr_tokens:
167
+ ocr_minhash.update(token.encode())
168
+
169
+ # Calculate Jaccard similarity using MinHash
170
+ similarity = elem_minhash.jaccard(ocr_minhash)
171
+
172
+ # Match if similarity >= 50%
173
+ if similarity >= 0.5:
174
+ return True
175
+
176
+ return False
177
+
178
+
179
+ class LCAFinder:
180
+ """Finds lowest common ancestor of elements"""
181
+
182
+ def find_lca(
183
+ self,
184
+ elements: List[ET.Element],
185
+ parent_map: Dict[ET.Element, ET.Element]
186
+ ) -> Optional[ET.Element]:
187
+ """
188
+ Find the lowest common ancestor (LCA) of a list of XML elements.
189
+
190
+ Traverses up from each element to the root, finding the deepest node
191
+ that is an ancestor of all given elements.
192
+
193
+ Args:
194
+ elements: List of XML elements to find LCA for
195
+ parent_map: Dictionary mapping each element to its parent
196
+
197
+ Returns:
198
+ The lowest common ancestor element, or None if elements list is empty
199
+ """
200
+ if not elements:
201
+ return None
202
+
203
+ paths = [self._get_path_to_root(elem, parent_map) for elem in elements]
204
+ min_length = min(len(path) for path in paths)
205
+
206
+ lca = None
207
+ for i in range(min_length):
208
+ if len(set(path[i] for path in paths)) == 1:
209
+ lca = paths[0][i]
210
+ else:
211
+ break
212
+
213
+ return lca
214
+
215
+ def _get_path_to_root(
216
+ self,
217
+ elem: ET.Element,
218
+ parent_map: Dict[ET.Element, ET.Element]
219
+ ) -> List[ET.Element]:
220
+ path = []
221
+ current = elem
222
+
223
+ while current is not None:
224
+ path.append(current)
225
+ current = parent_map.get(current)
226
+
227
+ return list(reversed(path))
228
+
229
+
230
+ class ActiveBasedPruner:
231
+ """Prunes stale subtrees by traversing up from active LCA"""
232
+
233
+ def find_and_prune_stale_subtrees(
234
+ self,
235
+ root: ET.Element,
236
+ active_elements: List[ET.Element],
237
+ stale_elements: List[ET.Element],
238
+ parent_map: Dict[ET.Element, ET.Element]
239
+ ) -> int:
240
+ if not active_elements:
241
+ return 0
242
+
243
+ stale_set = set(stale_elements)
244
+ lca_finder = LCAFinder()
245
+ active_lca = lca_finder.find_lca(active_elements, parent_map)
246
+
247
+ if active_lca is None:
248
+ return 0
249
+
250
+ total_removed = 0
251
+ current = active_lca
252
+
253
+ while current is not None:
254
+ parent = parent_map.get(current)
255
+ if parent is None:
256
+ break
257
+
258
+ siblings = [child for child in parent if child != current]
259
+
260
+ for sibling in siblings:
261
+ if self._subtree_contains_stale(sibling, stale_set):
262
+ removed = len(list(sibling.iter()))
263
+ parent.remove(sibling)
264
+ total_removed += removed
265
+
266
+ current = parent
267
+
268
+ return total_removed
269
+
270
+ def _subtree_contains_stale(self, node: ET.Element, stale_set: Set[ET.Element]) -> bool:
271
+ for elem in node.iter():
272
+ if elem in stale_set:
273
+ return True
274
+ return False
275
+
276
+
277
+ class XMLWriter:
278
+ """Writes cleaned XML to file asynchronously"""
279
+
280
+ async def save_cleaned_xml(self, tree: ET.ElementTree, output_path: str) -> None:
281
+ def _write_xml():
282
+ tree.write(output_path, encoding='utf-8', xml_declaration=True)
283
+
284
+ await asyncio.to_thread(_write_xml)
285
+
286
+
287
+ class XMLCleaner:
288
+ """
289
+ XML Cleaner with injected OCR dependency.
290
+ Now testable and flexible!
291
+ """
292
+
293
+ def __init__(
294
+ self,
295
+ ocr: BaseOCR,
296
+ thread_code: str = ""
297
+ ):
298
+ """
299
+ Args:
300
+ ocr: OCR provider (BaseOCR adapter wrapping OCRStrategy)
301
+ thread_code: Thread code for logging
302
+ """
303
+ self._ocr = ocr
304
+ self._thread_code = thread_code
305
+
306
+ # Create instances of supporting classes
307
+ self._loader = DataLoader()
308
+ self._parser = XMLParser()
309
+ self._detector = StaleDetector()
310
+ self._pruner = ActiveBasedPruner()
311
+ self._writer = XMLWriter()
312
+
313
+ @json_log()
314
+ @profile_it(
315
+ func_name="xml_cleaner",
316
+ tags={"operation_type": "xml_processing", "workflow": ""}
317
+ )
318
+ async def clean(
319
+ self,
320
+ image_path: str,
321
+ xml_path: str,
322
+ output_path: str
323
+ ) -> Dict:
324
+ """
325
+ Main workflow - simplified interface.
326
+
327
+ Args:
328
+ image_path: Path to screenshot
329
+ xml_path: Path to XML dump
330
+ output_path: Path to save cleaned XML
331
+
332
+ Returns:
333
+ Statistics dict with detailed latency breakdown
334
+ """
335
+ total_start_time = time.perf_counter()
336
+
337
+ try:
338
+
339
+ load_start = time.perf_counter()
340
+ img_path, tree = await self._loader.load_inputs(image_path, xml_path)
341
+ load_latency = time.perf_counter() - load_start
342
+
343
+
344
+ ocr_start = time.perf_counter()
345
+ visible_text = await self._ocr.extract_visible_text(img_path)
346
+ ocr_latency = time.perf_counter() - ocr_start
347
+
348
+
349
+ parse_start = time.perf_counter()
350
+ root, parent_map = self._parser.parse_xml(tree)
351
+ total_elements = len(list(root.iter()))
352
+ parse_latency = time.perf_counter() - parse_start
353
+
354
+
355
+ detect_start = time.perf_counter()
356
+ active_elements, stale_elements = self._detector.find_active_and_stale(
357
+ root, visible_text
358
+ )
359
+ detect_latency = time.perf_counter() - detect_start
360
+
361
+ # Early exit if no stale elements
362
+ if not stale_elements:
363
+ total_latency = time.perf_counter() - total_start_time
364
+ return {
365
+ 'status': 'clean',
366
+ 'removed': 0,
367
+ 'total_elements': total_elements,
368
+ 'active_count': len(active_elements),
369
+ 'visible_text_count': len(visible_text),
370
+ 'load_latency': load_latency,
371
+ 'ocr_latency': ocr_latency,
372
+ 'parse_latency': parse_latency,
373
+ 'detection_latency': detect_latency,
374
+ 'total_latency': total_latency
375
+ }
376
+
377
+
378
+ prune_start = time.perf_counter()
379
+ removed = self._pruner.find_and_prune_stale_subtrees(
380
+ root, active_elements, stale_elements, parent_map
381
+ )
382
+ prune_latency = time.perf_counter() - prune_start
383
+
384
+
385
+ save_start = time.perf_counter()
386
+ await self._writer.save_cleaned_xml(tree, output_path)
387
+ save_latency = time.perf_counter() - save_start
388
+
389
+ total_latency = time.perf_counter() - total_start_time
390
+
391
+ logger.info(
392
+ f"XML cleaning completed: removed {removed}/{total_elements} elements "
393
+ f"in {total_latency:.2f}s"
394
+ )
395
+
396
+ return {
397
+ 'status': 'cleaned',
398
+ 'removed': removed,
399
+ 'total_elements': total_elements,
400
+ 'method': 'active_based_sibling_pruning',
401
+ 'active_count': len(active_elements),
402
+ 'stale_count': len(stale_elements),
403
+ 'visible_text_count': len(visible_text),
404
+ 'load_latency': load_latency,
405
+ 'ocr_latency': ocr_latency,
406
+ 'parse_latency': parse_latency,
407
+ 'detection_latency': detect_latency,
408
+ 'pruning_latency': prune_latency,
409
+ 'save_latency': save_latency,
410
+ 'total_latency': total_latency
411
+ }
412
+
413
+ except Exception as e:
414
+ logger.error(f"Error in XML cleaner: {e}", exc_info=True)
415
+ total_latency = time.perf_counter() - total_start_time
416
+ return {
417
+ 'status': 'error',
418
+ 'error': str(e),
419
+ 'total_latency': total_latency
420
+ }
421
+
422
+
423
+ # Backward compatibility: Keep the old class name
424
class XMLCleanerCore:
    """Synchronous, stateless XML cleaning helpers (legacy class name).

    Elements are always compared against ``None`` explicitly: an
    ``xml.etree`` Element with no children is falsy, and the text-bearing
    nodes this class works on are typically leaves.
    """

    def __init__(self):
        pass  # Stateless, pure logic

    def parse_xml(self, xml_path):
        """Parse ``xml_path`` and return ``(tree, root, parent_map)``.

        ``parent_map`` maps every non-root element to its parent.
        """
        tree = ET.parse(xml_path)
        root = tree.getroot()
        parent_map = {child: parent for parent in root.iter() for child in parent}
        return tree, root, parent_map

    def find_active_and_stale(self, root, visible_text: Set[str]):
        """Split text-bearing elements into ``(active, stale)`` lists.

        An element is active when its ``text`` attribute matches any visible
        text (see ``_is_semantic_match``). Element texts and visible texts
        of two characters or fewer are ignored.
        """
        active_elements = []
        stale_elements = []

        # Filter noise from OCR
        clean_ocr = {t for t in visible_text if len(t) > 2}

        for elem in root.iter():
            text = elem.get('text', '').lower().strip()
            if len(text) <= 2:
                continue
            if self._is_semantic_match(text, clean_ocr):
                active_elements.append(elem)
            else:
                stale_elements.append(elem)
        return active_elements, stale_elements

    def _is_semantic_match(self, elem_text, visible_texts):
        """Return True if >= 50% of the element's word tokens occur in one visible text.

        Both sides are lower-cased so the comparison is case-insensitive.
        """
        elem_tokens = set(re.findall(r'\w+', elem_text.lower()))
        if not elem_tokens:
            # Nothing to compare (e.g. punctuation-only text).
            return False

        for ocr in visible_texts:
            ocr_tokens = set(re.findall(r'\w+', ocr.lower()))
            overlap = len(elem_tokens & ocr_tokens)
            if overlap / len(elem_tokens) >= 0.5:  # 50% match
                return True
        return False

    def prune_stale_subtrees(self, root, active_elements, stale_elements, parent_map):
        """Remove stale sibling subtrees along the path from the active LCA to the root.

        Returns:
            Number of elements removed (each pruned sibling plus its
            descendants); 0 when there are no active elements or no LCA.
        """
        if not active_elements:
            return 0

        # 1. Find LCA of active elements
        active_lca = self._find_lca(active_elements, parent_map)
        if active_lca is None:  # not truthiness: a leaf LCA is falsy
            return 0

        stale_set = set(stale_elements)
        removed_count = 0
        current = active_lca

        # 2. Traverse Up and Prune Siblings
        while True:
            parent = parent_map.get(current)
            if parent is None:
                break

            # Snapshot siblings first because ``parent`` is mutated below.
            siblings = [child for child in parent if child is not current]
            for sibling in siblings:
                if self._subtree_has_stale(sibling, stale_set):
                    removed_count += sum(1 for _ in sibling.iter())
                    parent.remove(sibling)

            current = parent
        return removed_count

    def _subtree_has_stale(self, node, stale_set):
        """Return True if ``node`` or any descendant is in ``stale_set``."""
        return any(member in stale_set for member in node.iter())

    def _find_lca(self, elements, parent_map):
        """Return the lowest common ancestor of ``elements`` (None if empty)."""
        # Get paths to root
        paths = []
        for el in elements:
            path = []
            curr = el
            # ``is not None`` is essential: a childless Element is falsy, so
            # a truthiness test would produce an empty path for every leaf.
            while curr is not None:
                path.append(curr)
                curr = parent_map.get(curr)
            path.reverse()
            paths.append(path)

        if not paths:
            return None

        # Find common prefix; zip stops at the shortest path.
        lca = None
        for level in zip(*paths):
            if len(set(level)) != 1:
                break
            lca = level[0]
        return lca
ocr_strategies.py ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import easyocr
2
+ import pytesseract
3
+ from abc import ABC, abstractmethod
4
+ from typing import Set
5
+ import cv2
6
+ import numpy as np
7
+
8
class OCRStrategy(ABC):
    """Interface that every OCR backend implements."""

    @abstractmethod
    def extract_text(self, image_path: str) -> Set[str]:
        """Return the set of text strings recognised in the image at ``image_path``."""
        return None
13
+
14
class EasyOCRStrategy(OCRStrategy):
    """OCR backend built on an EasyOCR reader."""

    def __init__(self):
        # The reader is created once per instance and reused for every call.
        print("Loading EasyOCR Model...")
        self.reader = easyocr.Reader(['en'], gpu=False)

    def extract_text(self, image_path: str) -> Set[str]:
        """Return the lower-cased, stripped, non-blank strings read from the image."""
        detected = self.reader.readtext(image_path, detail=0)
        normalized = set()
        for fragment in detected:
            if fragment.strip():
                normalized.add(fragment.lower().strip())
        return normalized
24
+
25
class TesseractOCRStrategy(OCRStrategy):
    """Concrete strategy for Tesseract OCR (Free & Fast)."""

    def extract_text(self, image_path: str) -> Set[str]:
        """Binarise the image, run Tesseract and return its non-blank lines.

        Args:
            image_path: Path of the image to read.

        Returns:
            Set of lower-cased, stripped text lines.

        Raises:
            ValueError: If the image cannot be read from ``image_path``.
        """
        # Preprocessing for better Tesseract accuracy
        img = cv2.imread(image_path)
        if img is None:
            # imread signals failure by returning None instead of raising;
            # fail here with a clear message rather than inside cvtColor.
            raise ValueError(f"Could not read image for OCR: {image_path}")

        gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
        # Apply thresholding
        _, thresh = cv2.threshold(gray, 150, 255, cv2.THRESH_BINARY)

        # Tesseract execution
        text = pytesseract.image_to_string(thresh)

        # Process results
        lines = text.split('\n')
        return {line.lower().strip() for line in lines if line.strip()}
40
+
41
class OCRFactory:
    """Creates OCR strategies on demand and reuses one instance per name."""

    # Cache of already-built strategies, keyed by the UI label.
    _instances = {}

    @staticmethod
    def get_strategy(strategy_name: str) -> OCRStrategy:
        """Return the cached strategy for ``strategy_name``, building it on first use.

        Raises:
            ValueError: If ``strategy_name`` is not a known label.
        """
        cache = OCRFactory._instances
        if strategy_name in cache:
            return cache[strategy_name]

        if strategy_name == "EasyOCR (Best Accuracy)":
            strategy = EasyOCRStrategy()
        elif strategy_name == "Tesseract (Fast & Free)":
            strategy = TesseractOCRStrategy()
        else:
            raise ValueError(f"Unknown strategy: {strategy_name}")

        cache[strategy_name] = strategy
        return strategy
visualizer.py ADDED
@@ -0,0 +1,188 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import matplotlib
2
+ matplotlib.use('Agg') # Non-interactive backend for web apps
3
+ import matplotlib.pyplot as plt
4
+ from matplotlib.patches import FancyBboxPatch, FancyArrowPatch
5
+ import xml.etree.ElementTree as ET
6
+ import cv2
7
+ import re
8
+ from typing import Dict, Set, Tuple
9
+
10
+ # ==========================================
11
+ # 1. Tree Visualizer (Refactored from your upload)
12
+ # ==========================================
13
class XMLTreeVisualizer:
    """Render an XML element tree as a node-link diagram with matplotlib."""

    # Single color scheme for all nodes
    DEFAULT_COLOR = {'fill': '#E3F2FD', 'border': '#1976D2', 'text': '#000000'}
    HIGHLIGHT_COLOR = {'fill': '#FFF59D', 'border': '#F57F17', 'text': '#000000'}  # Yellow for active nodes

    def visualize(self, xml_path: str, output_path: str, visible_text: Set[str] = None, active_elements: Set = None):
        """Generate the tree visualization and save it to *output_path*.

        Args:
            xml_path: Path of the XML file to parse.
            output_path: Path the rendered figure is written to.
            visible_text: Passed through to the node drawing step; it is
                not consulted by the current drawing logic.
            active_elements: Elements to highlight. When empty or None no
                node is highlighted.
        """
        tree = ET.parse(xml_path)
        root = tree.getroot()

        # Calculate Layout
        positions = self._calculate_layout(root)
        x_min, x_max, y_min, y_max = self._axis_limits(positions)

        # Setup Figure with larger size for better text readability
        fig, ax = plt.subplots(figsize=(24, 18))
        try:
            # Limits come from the real node coordinates so that no box is
            # clipped at the axes edge, however wide the tree is.
            ax.set_xlim(x_min, x_max)
            ax.set_ylim(y_min, y_max)
            ax.axis('off')

            # Draw Edges
            self._draw_edges(ax, positions, root)

            # Draw Nodes
            self._draw_nodes(ax, positions, visible_text, active_elements)

            plt.title("XML Tree Structure" + (" (Active Nodes Highlighted)" if active_elements else ""), fontsize=16)
            plt.tight_layout()
            plt.savefig(output_path, dpi=150, bbox_inches='tight')
        finally:
            # Always release the figure, even if drawing or saving failed.
            plt.close(fig)

    @staticmethod
    def _axis_limits(positions, margin=2.0):
        """Return (x_min, x_max, y_min, y_max) enclosing every node.

        *margin* is added on each side; it is larger than half of the
        widest box drawn by ``_draw_nodes`` (3.0 wide plus 0.1 padding).
        """
        xs = [p['x'] for p in positions.values()]
        ys = [p['y'] for p in positions.values()]
        return min(xs) - margin, max(xs) + margin, min(ys) - margin, max(ys) + margin

    def _calculate_layout(self, root, x=0, y=0, level=0, spacing=2.5):
        """Assign a position to every element under *root*.

        Each leaf occupies one unit of width and a parent is as wide as
        the sum of its children; children are centred under their parent
        and placed 2 units lower per level.

        Returns:
            Dict keyed by ``id(element)`` with ``x``, ``y``, ``level`` and
            ``element`` entries, in pre-order.
        """
        positions = {}
        widths = {}

        def _measure(node):
            # Post-order pass so each subtree width is computed only once.
            children = list(node)
            width = sum(_measure(c) for c in children) if children else 1.0
            widths[id(node)] = width
            return width

        def _assign_pos(node, curr_x, curr_y, curr_level):
            positions[id(node)] = {'x': curr_x, 'y': curr_y, 'level': curr_level, 'element': node}
            children = list(node)
            if not children:
                return

            start_x = curr_x - (widths[id(node)] * spacing / 2)

            current_offset = 0
            for child in children:
                child_w = widths[id(child)]
                child_x = start_x + (current_offset + child_w / 2) * spacing
                _assign_pos(child, child_x, curr_y - 2, curr_level + 1)
                current_offset += child_w

        _measure(root)
        _assign_pos(root, x, y, level)
        return positions

    def _draw_edges(self, ax, positions, node):
        """Draw a straight line from *node* to each child, recursively."""
        node_id = id(node)
        if node_id not in positions:
            return
        parent_pos = positions[node_id]

        for child in node:
            child_id = id(child)
            if child_id in positions:
                child_pos = positions[child_id]
                arrow = FancyArrowPatch(
                    (parent_pos['x'], parent_pos['y']),
                    (child_pos['x'], child_pos['y']),
                    arrowstyle='-', color='#555', linewidth=1, zorder=1
                )
                ax.add_patch(arrow)
                self._draw_edges(ax, positions, child)

    @staticmethod
    def _wrap_label(text, width=20, max_lines=2):
        """Wrap *text* at word boundaries for display inside a node box.

        Text no longer than *width* is returned unchanged. Longer text is
        split on whitespace into lines of at most *width* characters (a
        single word longer than *width* stays on its own line) and only
        the first *max_lines* lines are kept.
        """
        if len(text) <= width:
            return text

        lines = []
        current_line = ""
        for word in text.split():
            candidate = current_line + " " + word if current_line else word
            if len(candidate) <= width:
                current_line = candidate
            else:
                if current_line:
                    lines.append(current_line)
                current_line = word
        if current_line:
            lines.append(current_line)
        return "\n".join(lines[:max_lines])

    def _draw_nodes(self, ax, positions, visible_text, active_elements):
        """Draw one rounded box (and its label, if any) per positioned node.

        The label is the element's ``text`` attribute; nodes contained in
        *active_elements* use the highlight colours and a thicker border.
        """
        for info in positions.values():
            elem = info['element']
            display_text = self._wrap_label(elem.get('text', '').strip())

            # Highlight Logic: Check if this element is in the active_elements set
            is_highlight = bool(active_elements) and elem in active_elements
            palette = self.HIGHLIGHT_COLOR if is_highlight else self.DEFAULT_COLOR
            lw = 3 if is_highlight else 1

            # Calculate box size based on text length
            if display_text:
                label_lines = display_text.split('\n')
                max_line_len = max(len(line) for line in label_lines)
                box_width = max(1.2, min(3.0, max_line_len * 0.15))
                box_height = max(0.8, 0.6 + (len(label_lines) - 1) * 0.4)
            else:
                max_line_len = 0
                box_width = 1.2
                box_height = 0.8

            # Draw Box
            box = FancyBboxPatch(
                (info['x'] - box_width / 2, info['y'] - box_height / 2), box_width, box_height,
                boxstyle="round,pad=0.1",
                facecolor=palette['fill'], edgecolor=palette['border'], linewidth=lw, zorder=2
            )
            ax.add_patch(box)

            # Draw Text Label - only show actual text from XML, make it readable
            if display_text:
                # Use larger, readable font - adjust based on text length
                if max_line_len <= 10:
                    fontsize = 11
                elif max_line_len <= 15:
                    fontsize = 10
                else:
                    fontsize = 9

                ax.text(info['x'], info['y'], display_text,
                        ha='center', va='center',
                        fontsize=fontsize,
                        zorder=3,
                        family='sans-serif',
                        weight='normal')
163
+
164
+ # ==========================================
165
+ # 2. Bounding Box Visualizer (Refactored)
166
+ # ==========================================
167
class BoundingBoxVisualizer:
    """Draw the ``bounds`` rectangle of every XML element onto an image."""

    # Rectangle colours, cycled in order as boxes are drawn.
    COLORS = [(255, 0, 0), (0, 255, 0), (0, 0, 255), (255, 255, 0)]

    # "[x1,y1][x2,y2]"; compiled once, and a leading minus sign is accepted
    # so that elements positioned partly off-screen are not skipped.
    _BOUNDS_RE = re.compile(r'\[(-?\d+),(-?\d+)\]\[(-?\d+),(-?\d+)\]')

    @classmethod
    def _parse_bounds(cls, bounds):
        """Return ``(x1, y1, x2, y2)`` parsed from *bounds*, or None.

        None is returned when *bounds* is empty/None or does not start
        with the ``[x1,y1][x2,y2]`` pattern.
        """
        if not bounds:
            return None
        match = cls._BOUNDS_RE.match(bounds)
        if match is None:
            return None
        return tuple(int(group) for group in match.groups())

    def visualize(self, image_path: str, xml_path: str, output_path: str):
        """Overlay element bounding boxes on an image and save the result.

        Does nothing (and writes no file) when the image at *image_path*
        cannot be loaded.

        Args:
            image_path: Path of the image to draw on.
            xml_path: Path of the XML file whose elements carry ``bounds``
                attributes.
            output_path: Path the annotated image is written to.
        """
        img = cv2.imread(image_path)
        if img is None:
            return

        tree = ET.parse(xml_path)
        idx = 0

        for elem in tree.getroot().iter():
            box = self._parse_bounds(elem.get('bounds'))
            if box is None:
                continue
            x1, y1, x2, y2 = box
            color = self.COLORS[idx % len(self.COLORS)]
            cv2.rectangle(img, (x1, y1), (x2, y2), color, 2)
            idx += 1

        cv2.imwrite(output_path, img)