Spaces:

Suhasdev
/

Xml-Cleaner

Sleeping

App Files Files Community

Suhasdev committed on Dec 30, 2025

Commit

ba6e49b

0 Parent(s):

Refactor XML Cleaner with dependency injection and MinHash-based similarity matching

Browse files

Files changed (7) hide show

.gitattributes +35 -0
.gitignore +15 -0
README.md +40 -0
app.py +241 -0
core_cleaner.py +514 -0
ocr_strategies.py +54 -0
visualizer.py +188 -0

.gitattributes ADDED Viewed

	@@ -0,0 +1,35 @@

+*.7z filter=lfs diff=lfs merge=lfs -text
+*.arrow filter=lfs diff=lfs merge=lfs -text
+*.bin filter=lfs diff=lfs merge=lfs -text
+*.bz2 filter=lfs diff=lfs merge=lfs -text
+*.ckpt filter=lfs diff=lfs merge=lfs -text
+*.ftz filter=lfs diff=lfs merge=lfs -text
+*.gz filter=lfs diff=lfs merge=lfs -text
+*.h5 filter=lfs diff=lfs merge=lfs -text
+*.joblib filter=lfs diff=lfs merge=lfs -text
+*.lfs.* filter=lfs diff=lfs merge=lfs -text
+*.mlmodel filter=lfs diff=lfs merge=lfs -text
+*.model filter=lfs diff=lfs merge=lfs -text
+*.msgpack filter=lfs diff=lfs merge=lfs -text
+*.npy filter=lfs diff=lfs merge=lfs -text
+*.npz filter=lfs diff=lfs merge=lfs -text
+*.onnx filter=lfs diff=lfs merge=lfs -text
+*.ot filter=lfs diff=lfs merge=lfs -text
+*.parquet filter=lfs diff=lfs merge=lfs -text
+*.pb filter=lfs diff=lfs merge=lfs -text
+*.pickle filter=lfs diff=lfs merge=lfs -text
+*.pkl filter=lfs diff=lfs merge=lfs -text
+*.pt filter=lfs diff=lfs merge=lfs -text
+*.pth filter=lfs diff=lfs merge=lfs -text
+*.rar filter=lfs diff=lfs merge=lfs -text
+*.safetensors filter=lfs diff=lfs merge=lfs -text
+saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+*.tar.* filter=lfs diff=lfs merge=lfs -text
+*.tar filter=lfs diff=lfs merge=lfs -text
+*.tflite filter=lfs diff=lfs merge=lfs -text
+*.tgz filter=lfs diff=lfs merge=lfs -text
+*.wasm filter=lfs diff=lfs merge=lfs -text
+*.xz filter=lfs diff=lfs merge=lfs -text
+*.zip filter=lfs diff=lfs merge=lfs -text
+*.zst filter=lfs diff=lfs merge=lfs -text
+*tfevents* filter=lfs diff=lfs merge=lfs -text

.gitignore ADDED Viewed

	@@ -0,0 +1,15 @@

+__pycache__/
+*.py[cod]
+*$py.class
+*.so
+.Python
+venv/
+env/
+ENV/
+.venv
+*.egg-info/
+dist/
+build/
+.DS_Store
+*.log

README.md ADDED Viewed

	@@ -0,0 +1,40 @@

+---
+title: Intelligent XML Cleaner
+emoji: 🌳
+colorFrom: blue
+colorTo: green
+sdk: gradio
+sdk_version: 6.1.0
+app_file: app.py
+pinned: false
+---
+# Intelligent XML Cleaner & Visualizer
+This tool helps Android developers and QA engineers clean stale accessibility node information from UI XML dumps.
+## Features
+* **Active-Based Sibling Pruning:** Intelligently removes XML nodes that are not visible on the screen based on OCR analysis or manual text input.
+* **Flexible Text Input:** Optionally provide visible text manually, or use OCR for automatic extraction.
+* **Dual OCR Strategy:** Choose between **EasyOCR** (Deep Learning based, high accuracy) or **Tesseract** (Fast, standard) as fallback when manual text is not provided.
+* **Comprehensive Visualization:**
+  * **Tree View:** See the hierarchical structure of your XML before and after cleaning.
+  * **Screen View:** Visual confirmation of bounding boxes overlaid on the original screenshot.
+## How to use
+1. Upload the Screenshot of the app state.
+2. Upload the corresponding XML dump (from `uiautomator`).
+3. **(Optional)** Enter visible text from the screenshot manually (one per line or comma-separated). If left empty, OCR will be used automatically.
+4. Select your preferred OCR engine (only used if visible text is not provided).
+5. Click **Process**.
+6. View the comparisons in the tabs and download the cleaned XML.
+## Technical Details
+This application uses a sophisticated pipeline:
+1. **Text Extraction:** Uses provided visible text (if available) or extracts visible text from the image using OCR.
+2. **LCA Calculation:** Finds the Lowest Common Ancestor of all active elements.
+3. **Pruning:** Traverses upward from the Active LCA to the root and, at each level, prunes sibling subtrees that contain stale text (text that did not match any visible text).

app.py ADDED Viewed

	@@ -0,0 +1,241 @@

+import gradio as gr
+import os
+import tempfile
+import shutil
+from pathlib import Path
+import time
+# Import our modules
+from ocr_strategies import OCRFactory
+from core_cleaner import XMLCleanerCore
+from visualizer import XMLTreeVisualizer, BoundingBoxVisualizer
+# Initialize Logic Classes
+# Created once at import time and reused by every call to process_pipeline.
+cleaner_core = XMLCleanerCore()
+tree_viz = XMLTreeVisualizer()
+bbox_viz = BoundingBoxVisualizer()
+def process_pipeline(image_file, xml_file, ocr_choice, visible_text_input, progress=gr.Progress()):
+    # 1. Validation
+    if xml_file is None:
+        raise gr.Error("Please upload XML file.")
+    # Check if we need image (only if visible text is not provided)
+    use_ocr = not (visible_text_input and visible_text_input.strip())
+    if use_ocr and image_file is None:
+        raise gr.Error("Please upload Image file when using OCR, or provide visible text manually.")
+    start_time = time.time()
+    # 2. Setup Paths (Safe Temp Files)
+    temp_dir = Path(tempfile.gettempdir())
+    unique_id = str(int(time.time()))
+    # Paths for outputs
+    cleaned_xml_path = temp_dir / f"cleaned_{unique_id}.xml"
+    img_viz_before = temp_dir / f"bbox_before_{unique_id}.png"
+    img_viz_after = temp_dir / f"bbox_after_{unique_id}.png"
+    tree_viz_before = temp_dir / f"tree_before_{unique_id}.png"
+    tree_viz_after = temp_dir / f"tree_after_{unique_id}.png"
+    # 3. Text Extraction Stage (OCR or Manual Input)
+    text_source = None
+    if visible_text_input and visible_text_input.strip():
+        # Use provided visible text - NO OCR NEEDED
+        progress(0.2, desc="Using provided visible text (OCR skipped)...")
+        # Convert input text to set of strings (split by newlines or commas)
+        lines = [line.strip() for line in visible_text_input.replace(',', '\n').split('\n') if line.strip()]
+        visible_text = {line.lower().strip() for line in lines if line.strip()}
+        text_source = "Manual Input"
+    else:
+        # Use OCR - image is required here
+        progress(0.2, desc="Running OCR on image...")
+        ocr_engine = OCRFactory.get_strategy(ocr_choice)
+        visible_text = ocr_engine.extract_text(image_file)
+        text_source = ocr_choice
+    # 4. XML Parsing & Detection
+    progress(0.4, desc="Parsing XML...")
+    tree, root, parent_map = cleaner_core.parse_xml(xml_file)
+    progress(0.5, desc="Detecting Stale Elements...")
+    active, stale = cleaner_core.find_active_and_stale(root, visible_text)
+    # 5. Pruning
+    progress(0.6, desc="Pruning Tree...")
+    removed_count = 0
+    if stale:
+        removed_count = cleaner_core.prune_stale_subtrees(root, active, stale, parent_map)
+    # Save Cleaned XML
+    tree.write(str(cleaned_xml_path))
+    # 6. Visualization Generation
+    progress(0.7, desc="Generating Visualizations...")
+    # Bounding Boxes (only if image is provided)
+    if image_file is not None:
+        bbox_viz.visualize(image_file, xml_file, str(img_viz_before))
+        bbox_viz.visualize(image_file, str(cleaned_xml_path), str(img_viz_after))
+    else:
+        # Create placeholder images or skip
+        img_viz_before = None
+        img_viz_after = None
+    # Trees
+    progress(0.8, desc="Drawing Trees (This might take a moment)...")
+    # Before: no highlights
+    tree_viz.visualize(xml_file, str(tree_viz_before), visible_text=None, active_elements=None)
+    # After: highlight active elements (OCR matched nodes)
+    active_elements_set = set(active) if active else set()
+    tree_viz.visualize(str(cleaned_xml_path), str(tree_viz_after), visible_text, active_elements_set)
+    # 7. Stats
+    total_time = time.time() - start_time
+    stats_md = f"""
+    ### 📊 Process Statistics
+    | Metric | Result |
+    | :--- | :--- |
+    | **Text Source** | {text_source} |
+    | **Elements Removed** | `{removed_count}` |
+    | **Active Elements** | `{len(active)}` |
+    | **Stale Elements** | `{len(stale)}` |
+    | **Processing Time** | `{total_time:.2f}s` |
+    """
+    ocr_text_display = "\n".join(sorted(list(visible_text)))
+    progress(1.0, desc="Done!")
+    return (
+        str(tree_viz_before),
+        str(tree_viz_after),
+        str(img_viz_before) if img_viz_before else None,
+        str(img_viz_after) if img_viz_after else None,
+        stats_md,
+        ocr_text_display,
+        str(cleaned_xml_path)
+    )
+# --- Gradio UI Layout ---
+# Extra page styling; passed to app.launch() in the __main__ guard at the end of this file.
+custom_css = """
+.container { max-width: 1100px; margin: auto; }
+.header { text-align: center; margin-bottom: 20px; }
+.stat-box { border: 1px solid #ddd; padding: 10px; border-radius: 8px; background: #f9f9f9; }
+"""
+# Page structure: a header row, then an input column (left) and a results column (right).
+with gr.Blocks() as app:
+    with gr.Row():
+        gr.Markdown(
+            """
+            # 🌳 XML Cleaner & Visualizer Studio
+            **Optimize Mobile UI XMLs** by removing invisible/stale nodes using OCR-based or manual text input for sibling pruning.
+            """,
+            elem_classes="header"
+        )
+    with gr.Row():
+        # --- Left Panel: Inputs ---
+        with gr.Column(scale=1, variant="panel"):
+            gr.Markdown("### 1. Upload Data")
+            # type="filepath": the handler receives the screenshot as a path string.
+            img_input = gr.Image(type="filepath", label="Screenshot (PNG/JPG)")
+            gr.Markdown("*Optional if visible text is provided below*")
+            xml_input = gr.File(label="XML Layout Dump", file_types=[".xml"])
+            gr.Markdown("### 2. Visible Text (Optional)")
+            visible_text_input = gr.TextArea(
+                label="Visible Text",
+                placeholder="Enter visible text from the screenshot (one per line or comma-separated). Leave empty to use OCR.",
+                lines=5,
+                info="If provided, this text will be used instead of OCR. Otherwise, OCR will be used automatically."
+            )
+            # Status indicator for text input mode
+            # Hidden initially; toggle_ocr_selector shows it once manual text is entered.
+            text_input_status = gr.Markdown("", visible=False)
+            gr.Markdown("### 3. Settings")
+            # The choice strings double as the strategy names looked up by OCRFactory.get_strategy.
+            ocr_selector = gr.Dropdown(
+                choices=["EasyOCR (Best Accuracy)", "Tesseract (Fast & Free)"],
+                value="EasyOCR (Best Accuracy)",
+                label="OCR Engine (Fallback)",
+                info="Used only if visible text is not provided above.",
+                interactive=True
+            )
+            btn_run = gr.Button("✨ Run Analysis & Clean", variant="primary", size="lg")
+        # --- Right Panel: Outputs ---
+        with gr.Column(scale=2):
+            gr.Markdown("### 4. Analysis Results")
+            # Stats Area
+            stats_output = gr.Markdown()
+            # Visualization Tabs
+            with gr.Tabs():
+                with gr.TabItem("🌳 Tree Structure"):
+                    gr.Markdown("*Left: Original XML | Right: Cleaned XML (Active Nodes Highlighted)*")
+                    with gr.Row():
+                        out_tree_before = gr.Image(label="Before Pruning", type="filepath")
+                        out_tree_after = gr.Image(label="After Pruning", type="filepath")
+                with gr.TabItem("🖼️ Bounding Boxes"):
+                    gr.Markdown("*Visualizing XML bounds on the screenshot*")
+                    with gr.Row():
+                        out_bbox_before = gr.Image(label="Original Bounds", type="filepath")
+                        out_bbox_after = gr.Image(label="Cleaned Bounds", type="filepath")
+                with gr.TabItem("📝 OCR Data"):
+                    out_ocr_text = gr.TextArea(label="Detected Text", lines=10, interactive=False)
+            # Download
+            gr.Markdown("### 5. Export")
+            out_file = gr.File(label="Download Cleaned XML")
+    # Function to toggle OCR selector and image input based on visible text input
+    def toggle_ocr_selector(visible_text):
+        """Disable OCR selector if visible text is provided, enable if empty"""
+        if visible_text and visible_text.strip():
+            return (
+                gr.update(
+                    label="OCR Engine (Disabled - Using Manual Text)",
+                    info="⚠️ OCR is disabled because visible text is provided above.",
+                    interactive=False
+                ),
+                gr.update(value="✅ **Using Manual Text Input** - OCR is disabled. Image is optional.", visible=True),
+                gr.update(label="Screenshot (PNG/JPG) - Optional")
+            )
+        else:
+            return (
+                gr.update(
+                    label="OCR Engine",
+                    info="Select OCR engine to extract visible text from the screenshot.",
+                    interactive=True
+                ),
+                gr.update(value="", visible=False),
+                gr.update(label="Screenshot (PNG/JPG) - Required")
+            )
+    # Wire Interactions
+    # Update OCR selector and image input when visible text changes
+    visible_text_input.change(
+        fn=toggle_ocr_selector,
+        inputs=[visible_text_input],
+        outputs=[ocr_selector, text_input_status, img_input]
+    )
+    # The outputs list follows the order of the tuple returned by process_pipeline:
+    # tree before/after, bbox before/after, stats markdown, visible text, cleaned XML file.
+    btn_run.click(
+        fn=process_pipeline,
+        inputs=[img_input, xml_input, ocr_selector, visible_text_input],
+        outputs=[
+            out_tree_before, out_tree_after,
+            out_bbox_before, out_bbox_after,
+            stats_output, out_ocr_text, out_file
+        ]
+    )
+# Start the Gradio server only when run as a script; CSS and theme are supplied at launch time.
+if __name__ == "__main__":
+    app.launch(css=custom_css, theme=gr.themes.Soft())

core_cleaner.py ADDED Viewed

	@@ -0,0 +1,514 @@

+"""
+Refactored XML Cleaner with dependency injection.
+"""
+import time
+from pathlib import Path
+from typing import Dict, List, Tuple, Set, Optional
+from logging import getLogger
+import xml.etree.ElementTree as ET
+import asyncio
+import re
+from datasketch import MinHash
+from ocr_strategies import OCRStrategy
+from functools import wraps
+logger = getLogger(__name__)
+# Simple decorators for compatibility (can be enhanced later)
+def profile_it(func_name: str = "", tags: dict = None):
+    """Simple profiling decorator placeholder"""
+    def decorator(func):
+        @wraps(func)
+        async def wrapper(*args, **kwargs):
+            return await func(*args, **kwargs)
+        return wrapper
+    return decorator
+def json_log():
+    """Simple logging decorator placeholder"""
+    def decorator(func):
+        @wraps(func)
+        async def wrapper(*args, **kwargs):
+            return await func(*args, **kwargs)
+        return wrapper
+    return decorator
+# Adapter to make OCRStrategy async-compatible
+class BaseOCR:
+    """Adapter to make OCRStrategy work with async interface"""
+    def __init__(self, ocr_strategy: OCRStrategy):
+        self._strategy = ocr_strategy
+    async def extract_visible_text(self, image_path: Path) -> Set[str]:
+        """Extract visible text asynchronously"""
+        def _extract():
+            return self._strategy.extract_text(str(image_path))
+        return await asyncio.to_thread(_extract)
+class DataLoader:
+    """Loads and validates image and XML inputs asynchronously"""
+    async def load_inputs(self, image_path: str, xml_path: str) -> Tuple[Path, ET.ElementTree]:
+        img_path = Path(image_path)
+        xml_file = Path(xml_path)
+        if not img_path.exists():
+            raise FileNotFoundError(f"Image not found: {image_path}")
+        if not xml_file.exists():
+            raise FileNotFoundError(f"XML not found: {xml_path}")
+        def _parse_xml():
+            return ET.parse(xml_file)
+        tree = await asyncio.to_thread(_parse_xml)
+        return img_path, tree
+class XMLParser:
+    """Parses XML and builds parent-child relationships"""
+    def parse_xml(self, tree: ET.ElementTree) -> Tuple[ET.Element, Dict[ET.Element, ET.Element]]:
+        root = tree.getroot()
+        parent_map = {child: parent for parent in root.iter() for child in parent}
+        return root, parent_map
+class StaleDetector:
+    """
+    Identifies active and stale elements by comparing XML with OCR results.
+    Logic:
+    - Active elements: XML elements whose text matches OCR results (visible on screen)
+    - Stale elements: XML elements whose text does NOT match OCR results (not visible)
+    """
+    def find_active_and_stale(
+        self,
+        root: ET.Element,
+        visible_text: Set[str]
+    ) -> Tuple[List[ET.Element], List[ET.Element]]:
+        """
+        Compare all XML elements with OCR results to classify as active or stale.
+        Args:
+            root: Root element of the XML tree to process
+            visible_text: Set of text strings extracted from OCR (visible on screen)
+        Returns:
+            Tuple containing:
+                - List of active elements (text matches OCR results)
+                - List of stale elements (text does not match OCR results)
+        """
+        active_elements = []
+        stale_elements = []
+        filtered_ocr = {t for t in visible_text if len(t) > 2}
+        for elem in root.iter():
+            text = elem.get('text', '').lower().strip()
+            if text and len(text) > 2:
+                if self._is_similar(text, filtered_ocr):
+                    active_elements.append(elem)
+                else:
+                    stale_elements.append(elem)
+        return active_elements, stale_elements
+    def _is_similar(self, elem_text: str, visible_text: Set[str]) -> bool:
+        """
+        Check if text matches any visible text using MinHash-based Jaccard similarity.
+        Uses MinHash algorithm for efficient similarity comparison:
+        1. Tokenizes element text and OCR text into word tokens
+        2. Creates MinHash signatures for both texts
+        3. Calculates Jaccard similarity between MinHashes
+        4. Returns True if similarity >= 50%
+        MinHash provides better performance and accuracy for text similarity
+        compared to simple token overlap, especially for longer texts.
+        Args:
+            elem_text: Text from XML element
+            visible_text: Set of text strings from OCR results
+        Returns:
+            True if element text semantically matches any OCR text, False otherwise
+        """
+        # Tokenize element text into word tokens
+        elem_tokens = re.findall(r'\w+', elem_text.lower())
+        # Create MinHash for element text
+        if not elem_tokens:
+            return False
+        elem_minhash = MinHash(num_perm=32)
+        for token in elem_tokens:
+            elem_minhash.update(token.encode())
+        # Compare with each OCR text result using MinHash
+        for ocr_text in visible_text:
+            # Tokenize OCR text into word tokens
+            ocr_tokens = re.findall(r'\w+', ocr_text.lower())
+            if not ocr_tokens:
+                continue
+            # Create MinHash for OCR text
+            ocr_minhash = MinHash(num_perm=32)
+            for token in ocr_tokens:
+                ocr_minhash.update(token.encode())
+            # Calculate Jaccard similarity using MinHash
+            similarity = elem_minhash.jaccard(ocr_minhash)
+            # Match if similarity >= 50%
+            if similarity >= 0.5:
+                return True
+        return False
+class LCAFinder:
+    """Finds lowest common ancestor of elements"""
+    def find_lca(
+        self,
+        elements: List[ET.Element],
+        parent_map: Dict[ET.Element, ET.Element]
+    ) -> Optional[ET.Element]:
+        """
+        Find the lowest common ancestor (LCA) of a list of XML elements.
+        Traverses up from each element to the root, finding the deepest node
+        that is an ancestor of all given elements.
+        Args:
+            elements: List of XML elements to find LCA for
+            parent_map: Dictionary mapping each element to its parent
+        Returns:
+            The lowest common ancestor element, or None if elements list is empty
+        """
+        if not elements:
+            return None
+        paths = [self._get_path_to_root(elem, parent_map) for elem in elements]
+        min_length = min(len(path) for path in paths)
+        lca = None
+        for i in range(min_length):
+            if len(set(path[i] for path in paths)) == 1:
+                lca = paths[0][i]
+            else:
+                break
+        return lca
+    def _get_path_to_root(
+        self,
+        elem: ET.Element,
+        parent_map: Dict[ET.Element, ET.Element]
+    ) -> List[ET.Element]:
+        path = []
+        current = elem
+        while current is not None:
+            path.append(current)
+            current = parent_map.get(current)
+        return list(reversed(path))
+class ActiveBasedPruner:
+    """Prunes stale subtrees by traversing up from active LCA"""
+    def find_and_prune_stale_subtrees(
+        self,
+        root: ET.Element,
+        active_elements: List[ET.Element],
+        stale_elements: List[ET.Element],
+        parent_map: Dict[ET.Element, ET.Element]
+    ) -> int:
+        if not active_elements:
+            return 0
+        stale_set = set(stale_elements)
+        lca_finder = LCAFinder()
+        active_lca = lca_finder.find_lca(active_elements, parent_map)
+        if active_lca is None:
+            return 0
+        total_removed = 0
+        current = active_lca
+        while current is not None:
+            parent = parent_map.get(current)
+            if parent is None:
+                break
+            siblings = [child for child in parent if child != current]
+            for sibling in siblings:
+                if self._subtree_contains_stale(sibling, stale_set):
+                    removed = len(list(sibling.iter()))
+                    parent.remove(sibling)
+                    total_removed += removed
+            current = parent
+        return total_removed
+    def _subtree_contains_stale(self, node: ET.Element, stale_set: Set[ET.Element]) -> bool:
+        for elem in node.iter():
+            if elem in stale_set:
+                return True
+        return False
+class XMLWriter:
+    """Writes cleaned XML to file asynchronously"""
+    async def save_cleaned_xml(self, tree: ET.ElementTree, output_path: str) -> None:
+        def _write_xml():
+            tree.write(output_path, encoding='utf-8', xml_declaration=True)
+        await asyncio.to_thread(_write_xml)
+class XMLCleaner:
+    """
+    XML Cleaner with injected OCR dependency.
+    Now testable and flexible!
+    """
+    def __init__(
+        self,
+        ocr: BaseOCR,
+        thread_code: str = ""
+    ):
+        """
+        Args:
+            ocr: OCR provider (BaseOCR adapter wrapping OCRStrategy)
+            thread_code: Thread code for logging
+        """
+        self._ocr = ocr
+        self._thread_code = thread_code
+        # Create instances of supporting classes
+        self._loader = DataLoader()
+        self._parser = XMLParser()
+        self._detector = StaleDetector()
+        self._pruner = ActiveBasedPruner()
+        self._writer = XMLWriter()
+    @json_log()
+    @profile_it(
+        func_name="xml_cleaner",
+        tags={"operation_type": "xml_processing", "workflow": ""}
+    )
+    async def clean(
+        self,
+        image_path: str,
+        xml_path: str,
+        output_path: str
+    ) -> Dict:
+        """
+        Main workflow - simplified interface.
+        Args:
+            image_path: Path to screenshot
+            xml_path: Path to XML dump
+            output_path: Path to save cleaned XML
+        Returns:
+            Statistics dict with detailed latency breakdown
+        """
+        total_start_time = time.perf_counter()
+        try:
+            load_start = time.perf_counter()
+            img_path, tree = await self._loader.load_inputs(image_path, xml_path)
+            load_latency = time.perf_counter() - load_start
+            ocr_start = time.perf_counter()
+            visible_text = await self._ocr.extract_visible_text(img_path)
+            ocr_latency = time.perf_counter() - ocr_start
+            parse_start = time.perf_counter()
+            root, parent_map = self._parser.parse_xml(tree)
+            total_elements = len(list(root.iter()))
+            parse_latency = time.perf_counter() - parse_start
+            detect_start = time.perf_counter()
+            active_elements, stale_elements = self._detector.find_active_and_stale(
+                root, visible_text
+            )
+            detect_latency = time.perf_counter() - detect_start
+            # Early exit if no stale elements
+            if not stale_elements:
+                total_latency = time.perf_counter() - total_start_time
+                return {
+                    'status': 'clean',
+                    'removed': 0,
+                    'total_elements': total_elements,
+                    'active_count': len(active_elements),
+                    'visible_text_count': len(visible_text),
+                    'load_latency': load_latency,
+                    'ocr_latency': ocr_latency,
+                    'parse_latency': parse_latency,
+                    'detection_latency': detect_latency,
+                    'total_latency': total_latency
+                }
+            prune_start = time.perf_counter()
+            removed = self._pruner.find_and_prune_stale_subtrees(
+                root, active_elements, stale_elements, parent_map
+            )
+            prune_latency = time.perf_counter() - prune_start
+            save_start = time.perf_counter()
+            await self._writer.save_cleaned_xml(tree, output_path)
+            save_latency = time.perf_counter() - save_start
+            total_latency = time.perf_counter() - total_start_time
+            logger.info(
+                f"XML cleaning completed: removed {removed}/{total_elements} elements "
+                f"in {total_latency:.2f}s"
+            )
+            return {
+                'status': 'cleaned',
+                'removed': removed,
+                'total_elements': total_elements,
+                'method': 'active_based_sibling_pruning',
+                'active_count': len(active_elements),
+                'stale_count': len(stale_elements),
+                'visible_text_count': len(visible_text),
+                'load_latency': load_latency,
+                'ocr_latency': ocr_latency,
+                'parse_latency': parse_latency,
+                'detection_latency': detect_latency,
+                'pruning_latency': prune_latency,
+                'save_latency': save_latency,
+                'total_latency': total_latency
+            }
+        except Exception as e:
+            logger.error(f"Error in XML cleaner: {e}", exc_info=True)
+            total_latency = time.perf_counter() - total_start_time
+            return {
+                'status': 'error',
+                'error': str(e),
+                'total_latency': total_latency
+            }
+# Backward compatibility: Keep the old class name
+class XMLCleanerCore:
+    def __init__(self):
+        pass # Stateless, pure logic
+    def parse_xml(self, xml_path):
+        tree = ET.parse(xml_path)
+        root = tree.getroot()
+        parent_map = {c: p for p in root.iter() for c in p}
+        return tree, root, parent_map
+    def find_active_and_stale(self, root, visible_text: Set[str]):
+        active_elements = []
+        stale_elements = []
+        # Filter noise from OCR
+        clean_ocr = {t for t in visible_text if len(t) > 2}
+        for elem in root.iter():
+            text = elem.get('text', '').lower().strip()
+            if text and len(text) > 2:
+                if self._is_semantic_match(text, clean_ocr):
+                    active_elements.append(elem)
+                else:
+                    stale_elements.append(elem)
+        return active_elements, stale_elements
+    def _is_semantic_match(self, elem_text, visible_texts):
+        # Token based matching
+        elem_tokens = set(re.findall(r'\w+', elem_text))
+        for ocr in visible_texts:
+            ocr_tokens = set(re.findall(r'\w+', ocr))
+            if not elem_tokens: continue
+            overlap = len(elem_tokens & ocr_tokens)
+            if overlap / len(elem_tokens) >= 0.5: # 50% match
+                return True
+        return False
+    def prune_stale_subtrees(self, root, active_elements, stale_elements, parent_map):
+        if not active_elements: return 0
+        # 1. Find LCA of active elements
+        active_lca = self._find_lca(active_elements, parent_map)
+        if not active_lca: return 0
+        stale_set = set(stale_elements)
+        removed_count = 0
+        current = active_lca
+        # 2. Traverse Up and Prune Siblings
+        while current is not None:
+            parent = parent_map.get(current)
+            if not parent: break
+            siblings = [child for child in parent if child != current]
+            for sibling in siblings:
+                # If sibling tree has stale elements?
+                # Simplified: If sibling is strictly in stale list or contains them
+                if self._subtree_has_stale(sibling, stale_set):
+                    removed_count += len(list(sibling.iter()))
+                    parent.remove(sibling)
+            current = parent
+        return removed_count
+    def _subtree_has_stale(self, node, stale_set):
+        for x in node.iter():
+            if x in stale_set: return True
+        return False
+    def _find_lca(self, elements, parent_map):
+        # Get paths to root
+        paths = []
+        for el in elements:
+            path = []
+            curr = el
+            while curr:
+                path.append(curr)
+                curr = parent_map.get(curr)
+            paths.append(list(reversed(path)))
+        if not paths: return None
+        # Find common prefix
+        min_len = min(len(p) for p in paths)
+        lca = None
+        for i in range(min_len):
+            if len(set(p[i] for p in paths)) == 1:
+                lca = paths[0][i]
+            else:
+                break
+        return lca

ocr_strategies.py ADDED Viewed

	@@ -0,0 +1,54 @@

+import easyocr
+import pytesseract
+from abc import ABC, abstractmethod
+from typing import Set
+import cv2
+import numpy as np
+class OCRStrategy(ABC):
+    """Abstract base class for OCR strategies."""
+    @abstractmethod
+    def extract_text(self, image_path: str) -> Set[str]:
+        pass
+class EasyOCRStrategy(OCRStrategy):
+    """Concrete strategy for EasyOCR."""
+    def __init__(self):
+        # Initialize once to save memory/time
+        print("Loading EasyOCR Model...")
+        self.reader = easyocr.Reader(['en'], gpu=False)
+    def extract_text(self, image_path: str) -> Set[str]:
+        results = self.reader.readtext(image_path, detail=0)
+        return {text.lower().strip() for text in results if text.strip()}
+class TesseractOCRStrategy(OCRStrategy):
+    """Concrete strategy for Tesseract OCR (Free & Fast)."""
+    def extract_text(self, image_path: str) -> Set[str]:
+        # Preprocessing for better Tesseract accuracy
+        img = cv2.imread(image_path)
+        gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
+        # Apply thresholding
+        _, thresh = cv2.threshold(gray, 150, 255, cv2.THRESH_BINARY)
+        # Tesseract execution
+        text = pytesseract.image_to_string(thresh)
+        # Process results
+        lines = text.split('\n')
+        return {line.lower().strip() for line in lines if line.strip()}
class OCRFactory:
    """Hands out OCR strategy objects, building each one at most once per name."""

    # Strategies that have already been constructed, keyed by strategy name.
    _instances = {}

    @staticmethod
    def get_strategy(strategy_name: str) -> OCRStrategy:
        """Return the strategy registered under ``strategy_name``.

        The first request for a name constructs the strategy and stores it;
        later requests return the stored object.

        Raises:
            ValueError: If ``strategy_name`` is not a recognized strategy.
        """
        cache = OCRFactory._instances
        if strategy_name in cache:
            return cache[strategy_name]

        if strategy_name == "EasyOCR (Best Accuracy)":
            strategy = EasyOCRStrategy()
        elif strategy_name == "Tesseract (Fast & Free)":
            strategy = TesseractOCRStrategy()
        else:
            raise ValueError(f"Unknown strategy: {strategy_name}")

        cache[strategy_name] = strategy
        return strategy

visualizer.py ADDED Viewed

	@@ -0,0 +1,188 @@

+import matplotlib
+matplotlib.use('Agg') # Non-interactive backend for web apps
+import matplotlib.pyplot as plt
+from matplotlib.patches import FancyBboxPatch, FancyArrowPatch
+import xml.etree.ElementTree as ET
+import cv2
+import re
+from typing import Dict, Set, Tuple
+# ==========================================
+# 1. Tree Visualizer (Refactored from your upload)
+# ==========================================
class XMLTreeVisualizer:
    """Render an XML document as a tree diagram and save it as an image.

    Every element becomes a rounded box labelled with its ``text`` attribute
    (the box is left blank when that attribute is missing or empty), and each
    parent/child relation is drawn as a straight connector line.
    """

    # Single color scheme for all nodes
    DEFAULT_COLOR = {'fill': '#E3F2FD', 'border': '#1976D2', 'text': '#000000'}
    HIGHLIGHT_COLOR = {'fill': '#FFF59D', 'border': '#F57F17', 'text': '#000000'}  # Yellow for active nodes

    # Labels longer than this many characters are wrapped at word boundaries.
    _WRAP_WIDTH = 20
    # Wrapped labels are cut off after this many lines.
    _MAX_LABEL_LINES = 2
    # Horizontal padding around the outermost node centers; it must exceed
    # half of the widest box (3.0 / 2 plus the 0.1 box padding).
    _X_MARGIN = 2.0

    def visualize(self, xml_path: str, output_path: str, visible_text: Set[str] = None, active_elements: Set = None):
        """Generate the tree visualization for ``xml_path``.

        Args:
            xml_path: XML file to parse and draw.
            output_path: Destination image file.
            visible_text: Forwarded to ``_draw_nodes``; not used for drawing.
            active_elements: Optional collection of elements to highlight.
                When it is non-empty the title is suffixed accordingly.
        """
        root = ET.parse(xml_path).getroot()

        # Calculate layout
        positions = self._calculate_layout(root)
        max_depth = max(p['level'] for p in positions.values())
        x_min, x_max = self._x_limits(positions)

        # Larger figure size for better text readability
        fig, ax = plt.subplots(figsize=(24, 18))
        try:
            # Limits follow the real node coordinates; patches are clipped to
            # the axes, so limits that are too narrow make boxes disappear.
            ax.set_xlim(x_min, x_max)
            ax.set_ylim(-max_depth * 2 - 2, 2)
            ax.axis('off')

            self._draw_edges(ax, positions, root)
            self._draw_nodes(ax, positions, visible_text, active_elements)

            title = "XML Tree Structure" + (" (Active Nodes Highlighted)" if active_elements else "")
            ax.set_title(title, fontsize=16)
            fig.tight_layout()
            fig.savefig(output_path, dpi=150, bbox_inches='tight')
        finally:
            # Always release the figure, even if drawing or saving failed.
            plt.close(fig)

    @classmethod
    def _x_limits(cls, positions):
        """Return ``(x_min, x_max)`` axis limits enclosing every node."""
        xs = [p['x'] for p in positions.values()]
        return (min(xs) - cls._X_MARGIN, max(xs) + cls._X_MARGIN)

    @staticmethod
    def _element_key(elem):
        """Return a hashable (tag, sorted attributes) signature of ``elem``."""
        return (elem.tag, tuple(sorted(elem.attrib.items())))

    @classmethod
    def _active_keys(cls, active_elements):
        """Return the signatures of every XML element in ``active_elements``."""
        if not active_elements:
            return set()
        return {cls._element_key(e) for e in active_elements if ET.iselement(e)}

    @classmethod
    def _wrap_label(cls, text):
        """Wrap ``text`` at word boundaries, keeping at most two lines.

        Text of up to ``_WRAP_WIDTH`` characters is returned unchanged. Single
        words longer than the width are kept whole on their own line.
        """
        if len(text) <= cls._WRAP_WIDTH:
            return text
        lines = []
        current = ""
        for word in text.split():
            if not current:
                current = word
            elif len(current) + 1 + len(word) <= cls._WRAP_WIDTH:
                current = f"{current} {word}"
            else:
                lines.append(current)
                current = word
        if current:
            lines.append(current)
        return "\n".join(lines[:cls._MAX_LABEL_LINES])

    def _calculate_layout(self, root, x=0, y=0, level=0, spacing=2.5):
        """Assign a position to every element of the tree rooted at ``root``.

        A leaf is one unit wide and an inner node is as wide as the sum of its
        children. Children are centered under their parent, two units lower.

        Returns:
            Dict keyed by ``id(element)`` whose values are dicts with the keys
            ``'x'``, ``'y'``, ``'level'`` and ``'element'``.
        """
        positions = {}
        widths = {}  # id(node) -> subtree width, so each subtree is measured once

        def _get_width(node):
            key = id(node)
            if key not in widths:
                children = list(node)
                widths[key] = sum(_get_width(c) for c in children) if children else 1.0
            return widths[key]

        def _assign_pos(node, curr_x, curr_y, curr_level):
            positions[id(node)] = {'x': curr_x, 'y': curr_y, 'level': curr_level, 'element': node}
            children = list(node)
            if not children:
                return
            start_x = curr_x - (_get_width(node) * spacing / 2)
            current_offset = 0
            for child in children:
                child_w = _get_width(child)
                child_x = start_x + (current_offset + child_w / 2) * spacing
                _assign_pos(child, child_x, curr_y - 2, curr_level + 1)
                current_offset += child_w

        _assign_pos(root, x, y, level)
        return positions

    def _draw_edges(self, ax, positions, node):
        """Draw connector lines from ``node`` to its descendants, recursively."""
        node_id = id(node)
        if node_id not in positions:
            return
        parent_pos = positions[node_id]
        for child in node:
            child_id = id(child)
            if child_id in positions:
                child_pos = positions[child_id]
                arrow = FancyArrowPatch(
                    (parent_pos['x'], parent_pos['y']),
                    (child_pos['x'], child_pos['y']),
                    arrowstyle='-', color='#555', linewidth=1, zorder=1
                )
                ax.add_patch(arrow)
                self._draw_edges(ax, positions, child)

    def _draw_nodes(self, ax, positions, visible_text, active_elements):
        """Draw one labelled box per positioned element.

        An element is highlighted when it is a member of ``active_elements``
        or when its tag and attributes equal those of a member. The second
        rule exists because ``visualize`` parses the file itself, so its
        element objects are never the same objects a caller holds. Elements
        sharing an identical tag and attribute set are highlighted together.
        NOTE(review): confirm against the caller that this matching is the
        intended meaning of "active".

        ``visible_text`` is accepted for interface compatibility and is not
        used here.
        """
        active_keys = self._active_keys(active_elements)
        for info in positions.values():
            elem = info['element']
            text = elem.get('text', '').strip()

            is_highlight = bool(active_elements) and (
                elem in active_elements or self._element_key(elem) in active_keys
            )
            palette = self.HIGHLIGHT_COLOR if is_highlight else self.DEFAULT_COLOR
            lw = 3 if is_highlight else 1

            # Size the box from the wrapped label
            display_text = self._wrap_label(text)
            if display_text:
                line_lengths = [len(line) for line in display_text.split('\n')]
                longest = max(line_lengths)
                box_width = max(1.2, min(3.0, longest * 0.15))
                box_height = max(0.8, 0.6 + (len(line_lengths) - 1) * 0.4)
            else:
                longest = 0
                box_width = 1.2
                box_height = 0.8

            box = FancyBboxPatch(
                (info['x'] - box_width / 2, info['y'] - box_height / 2), box_width, box_height,
                boxstyle="round,pad=0.1",
                facecolor=palette['fill'], edgecolor=palette['border'], linewidth=lw, zorder=2
            )
            ax.add_patch(box)

            if not display_text:
                continue

            # Shrink the font slightly for longer lines
            if longest <= 10:
                fontsize = 11
            elif longest <= 15:
                fontsize = 10
            else:
                fontsize = 9
            ax.text(info['x'], info['y'], display_text,
                    ha='center', va='center',
                    fontsize=fontsize,
                    zorder=3,
                    family='sans-serif',
                    weight='normal')
+# ==========================================
+# 2. Bounding Box Visualizer (Refactored)
+# ==========================================
class BoundingBoxVisualizer:
    """Draw the ``bounds`` rectangle of every XML element onto an image."""

    # Rectangle colors, used in rotation. OpenCV reads these tuples as
    # (blue, green, red) on images loaded with cv2.imread.
    COLORS = [(255, 0, 0), (0, 255, 0), (0, 0, 255), (255, 255, 0)]

    # Matches "[x1,y1][x2,y2]"; the sign is optional so that elements lying
    # partly outside the image are not skipped.
    _BOUNDS_RE = re.compile(r'\[(-?\d+),(-?\d+)\]\[(-?\d+),(-?\d+)\]')

    @classmethod
    def _parse_bounds(cls, bounds):
        """Return ``(x1, y1, x2, y2)`` parsed from ``bounds``, or ``None``.

        ``None`` is returned when ``bounds`` is empty, missing, or does not
        start with the ``[x1,y1][x2,y2]`` pattern.
        """
        if not bounds:
            return None
        match = cls._BOUNDS_RE.match(bounds)
        if match is None:
            return None
        return tuple(int(group) for group in match.groups())

    def visualize(self, image_path: str, xml_path: str, output_path: str):
        """Write a copy of ``image_path`` with element bounds drawn on it.

        Args:
            image_path: Image to draw on.
            xml_path: XML file whose elements carry ``bounds`` attributes.
            output_path: Destination image file.

        If the image cannot be loaded the method returns without writing
        anything. Elements without a parsable ``bounds`` value are skipped.
        """
        img = cv2.imread(image_path)
        if img is None:
            return

        tree = ET.parse(xml_path)
        idx = 0
        for elem in tree.getroot().iter():
            coords = self._parse_bounds(elem.get('bounds'))
            if coords is None:
                continue
            x1, y1, x2, y2 = coords
            color = self.COLORS[idx % len(self.COLORS)]
            cv2.rectangle(img, (x1, y1), (x2, y2), color, 2)
            idx += 1
        cv2.imwrite(output_path, img)