import os
from typing import Dict, List, Optional, Tuple

import cv2
import numpy as np

from tools.config import OUTPUT_FOLDER, SAVE_WORD_SEGMENTER_OUTPUT_IMAGES

# Adaptive thresholding parameters
BLOCK_SIZE_FACTOR = 1.5  # Multiplier for adaptive threshold block size
C_VALUE = 2  # Constant subtracted from mean in adaptive thresholding

# Word segmentation search parameters
INITIAL_KERNEL_WIDTH_FACTOR = 0.0  # Starting kernel width factor for Stage 2 search
INITIAL_VALLEY_THRESHOLD_FACTOR = (
    0.0  # Starting valley threshold factor for Stage 1 search
)
MAIN_VALLEY_THRESHOLD_FACTOR = (
    0.15  # Primary valley threshold factor for word separation
)
MIN_SPACE_FACTOR = 0.2  # Minimum space width relative to character width
MATCH_TOLERANCE = 0  # Tolerance for word count matching

# Noise removal parameters
MIN_AREA_THRESHOLD = 6  # Minimum component area to be considered valid text
DEFAULT_TRIM_PERCENTAGE = (
    0.2  # Percentage to trim from top/bottom for vertical cropping
)

# Skew detection parameters
MIN_SKEW_THRESHOLD = 0.5  # Ignore angles smaller than this (likely noise)
MAX_SKEW_THRESHOLD = 15.0  # Angles larger than this are extreme and likely errors


def _sanitize_filename(filename: str, max_length: int = 100) -> str:
    """
    Sanitizes a string to be used as a valid filename.
    Removes or replaces invalid characters for Windows/Linux file systems.

    Args:
        filename: The string to sanitize
        max_length: Maximum length of the sanitized filename

    Returns:
        A sanitized string safe for use in file names
    """
    if not filename:
        return "unnamed"

    # Replace spaces with underscores
    sanitized = filename.replace(" ", "_")

    # Remove or replace invalid characters for Windows/Linux
    # Invalid: < > : " / \ | ? *
    invalid_chars = '<>:"/\\|?*'
    for char in invalid_chars:
        sanitized = sanitized.replace(char, "_")

    # Remove control characters
    sanitized = "".join(
        char for char in sanitized if ord(char) >= 32 or char in "\n\r\t"
    )

    # Remove leading/trailing dots and spaces (Windows doesn't allow these)
    sanitized = sanitized.strip(". ")

    # Replace multiple consecutive underscores with a single one
    while "__" in sanitized:
        sanitized = sanitized.replace("__", "_")

    # Truncate if too long
    if len(sanitized) > max_length:
        sanitized = sanitized[:max_length]

    # Ensure it's not empty after sanitization
    if not sanitized:
        sanitized = "unnamed"

    return sanitized
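
# Illustrative behaviour (hypothetical inputs):
#   _sanitize_filename('my file: "draft?"') -> 'my_file_draft_'
#   _sanitize_filename("..hidden..")        -> 'hidden'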


class AdaptiveSegmenter:
    """
    Line to word segmentation pipeline. It features:
    1. Adaptive Thresholding.
    2. Targeted Noise Removal using Connected Component Analysis.
    3. The robust two-stage adaptive search (Valley -> Kernel).
    4. CCA for final pixel-perfect refinement.
    """

    def __init__(self, output_folder: str = OUTPUT_FOLDER):
        self.output_folder = output_folder
        self.fallback_segmenter = HybridWordSegmenter()

    def _correct_orientation(
        self, gray_image: np.ndarray
    ) -> Tuple[np.ndarray, np.ndarray]:
        """
        Detects and corrects 90-degree orientation issues.
        """
        h, w = gray_image.shape
        center = (w // 2, h // 2)

        block_size = 21
        if h < block_size:
            block_size = h if h % 2 != 0 else h - 1

        if block_size > 3:
            binary = cv2.adaptiveThreshold(
                gray_image,
                255,
                cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
                cv2.THRESH_BINARY_INV,
                block_size,
                4,
            )
        else:
            _, binary = cv2.threshold(
                gray_image, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU
            )

        opening_kernel = np.ones((2, 2), np.uint8)
        binary = cv2.morphologyEx(binary, cv2.MORPH_OPEN, opening_kernel)

        coords = np.column_stack(np.where(binary > 0))
        if len(coords) < 50:
            M_orient = cv2.getRotationMatrix2D(center, 0, 1.0)
            return gray_image, M_orient

        ymin, xmin = coords.min(axis=0)
        ymax, xmax = coords.max(axis=0)
        box_height = ymax - ymin
        box_width = xmax - xmin

        orientation_angle = 0.0
        if box_height > box_width:
            orientation_angle = 90.0
        else:
            M_orient = cv2.getRotationMatrix2D(center, 0, 1.0)
            return gray_image, M_orient

        M_orient = cv2.getRotationMatrix2D(center, orientation_angle, 1.0)
        new_w, new_h = h, w
        M_orient[0, 2] += (new_w - w) / 2
        M_orient[1, 2] += (new_h - h) / 2

        oriented_gray = cv2.warpAffine(
            gray_image,
            M_orient,
            (new_w, new_h),
            flags=cv2.INTER_CUBIC,
            borderMode=cv2.BORDER_REPLICATE,
        )

        return oriented_gray, M_orient

    def _deskew_image(self, gray_image: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
        """
        Detects skew using a robust method that normalizes minAreaRect.
        """
        h, w = gray_image.shape

        block_size = 21
        if h < block_size:
            block_size = h if h % 2 != 0 else h - 1

        if block_size > 3:
            binary = cv2.adaptiveThreshold(
                gray_image,
                255,
                cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
                cv2.THRESH_BINARY_INV,
                block_size,
                4,
            )
        else:
            _, binary = cv2.threshold(
                gray_image, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU
            )

        opening_kernel = np.ones((2, 2), np.uint8)
        binary = cv2.morphologyEx(binary, cv2.MORPH_OPEN, opening_kernel)

        coords = np.column_stack(np.where(binary > 0))
        if len(coords) < 50:
            M = cv2.getRotationMatrix2D((w // 2, h // 2), 0, 1.0)
            return gray_image, M

        rect = cv2.minAreaRect(coords[:, ::-1])
        rect_width, rect_height = rect[1]
        angle = rect[2]
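        # Note: cv2.minAreaRect's angle convention is ambiguous; the same box
        # can be reported with its sides swapped and the angle offset by 90
        # degrees, so the swap and +/-90 folding below normalize every report
        # into the [-45, 45] range expected for a text line.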

        if rect_width < rect_height:
            rect_width, rect_height = rect_height, rect_width
            angle += 90

        if angle > 45:
            angle -= 90
        elif angle < -45:
            angle += 90

        correction_angle = angle

        if abs(correction_angle) < MIN_SKEW_THRESHOLD:
            correction_angle = 0.0
        elif abs(correction_angle) > MAX_SKEW_THRESHOLD:
            correction_angle = 0.0

        center = (w // 2, h // 2)
        M = cv2.getRotationMatrix2D(center, correction_angle, 1.0)

        deskewed_gray = cv2.warpAffine(
            gray_image,
            M,
            (w, h),
            flags=cv2.INTER_CUBIC,
            borderMode=cv2.BORDER_REPLICATE,
        )

        return deskewed_gray, M

    def _get_boxes_from_profile(
        self,
        binary_image: np.ndarray,
        stable_avg_char_width: float,
        min_space_factor: float,
        valley_threshold_factor: float,
    ) -> List[Tuple[int, int, int, int]]:
        """
        Extracts word bounding boxes from vertical projection profile.
        """
        img_h, img_w = binary_image.shape
        vertical_projection = np.sum(binary_image, axis=0)
        peaks = vertical_projection[vertical_projection > 0]
        if len(peaks) == 0:
            return []
        avg_peak_height = np.mean(peaks)
        valley_threshold = int(avg_peak_height * valley_threshold_factor)
        min_space_width = int(stable_avg_char_width * min_space_factor)

        patched_projection = vertical_projection.copy()
        in_gap = False
        gap_start = 0

        for x, col_sum in enumerate(patched_projection):
            if col_sum <= valley_threshold and not in_gap:
                in_gap = True
                gap_start = x
            elif col_sum > valley_threshold and in_gap:
                in_gap = False
                if (x - gap_start) < min_space_width:
                    patched_projection[gap_start:x] = int(avg_peak_height)

        unlabeled_boxes = []
        in_word = False
        start_x = 0
        for x, col_sum in enumerate(patched_projection):
            if col_sum > valley_threshold and not in_word:
                start_x = x
                in_word = True
            elif col_sum <= valley_threshold and in_word:
                # [NOTE] Returns full height stripe
                unlabeled_boxes.append((start_x, 0, x - start_x, img_h))
                in_word = False
        if in_word:
            unlabeled_boxes.append((start_x, 0, img_w - start_x, img_h))
        return unlabeled_boxes

    def _enforce_logical_constraints(
        self, output: Dict[str, List], image_width: int, image_height: int
    ) -> Dict[str, List]:
        """
        Enforces geometric sanity checks with 2D awareness.
        """
        if not output or not output["text"]:
            return output

        num_items = len(output["text"])
        boxes = []
        for i in range(num_items):
            boxes.append(
                {
                    "text": output["text"][i],
                    "left": int(output["left"][i]),
                    "top": int(output["top"][i]),
                    "width": int(output["width"][i]),
                    "height": int(output["height"][i]),
                    "conf": output["conf"][i],
                }
            )

        valid_boxes = []
        for box in boxes:
            x0 = max(0, box["left"])
            y0 = max(0, box["top"])
            x1 = min(image_width, box["left"] + box["width"])
            y1 = min(image_height, box["top"] + box["height"])

            w = x1 - x0
            h = y1 - y0

            if w > 0 and h > 0:
                box["left"] = x0
                box["top"] = y0
                box["width"] = w
                box["height"] = h
                valid_boxes.append(box)
        boxes = valid_boxes

        is_vertical = image_height > (image_width * 1.2)
        if is_vertical:
            boxes.sort(key=lambda b: (b["top"], b["left"]))
        else:
            boxes.sort(key=lambda b: (b["left"], -b["width"]))

        final_pass_boxes = []
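        # Drop any box fully nested (within a 2 px margin) inside another box
        # carrying the same text, keeping the larger of the pair.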
        if boxes:
            keep_indices = [True] * len(boxes)
            for i in range(len(boxes)):
                for j in range(len(boxes)):
                    if i == j:
                        continue
                    b1 = boxes[i]
                    b2 = boxes[j]

                    x_nested = (b1["left"] >= b2["left"] - 2) and (
                        b1["left"] + b1["width"] <= b2["left"] + b2["width"] + 2
                    )
                    y_nested = (b1["top"] >= b2["top"] - 2) and (
                        b1["top"] + b1["height"] <= b2["top"] + b2["height"] + 2
                    )

                    if x_nested and y_nested:
                        if b1["text"] == b2["text"]:
                            if b1["width"] * b1["height"] <= b2["width"] * b2["height"]:
                                keep_indices[i] = False

            for i, keep in enumerate(keep_indices):
                if keep:
                    final_pass_boxes.append(boxes[i])

        boxes = final_pass_boxes

        if is_vertical:
            boxes.sort(key=lambda b: (b["top"], b["left"]))
        else:
            boxes.sort(key=lambda b: (b["left"], -b["width"]))

        for i in range(len(boxes)):
            for j in range(i + 1, len(boxes)):
                b1 = boxes[i]
                b2 = boxes[j]

                x_overlap = min(
                    b1["left"] + b1["width"], b2["left"] + b2["width"]
                ) - max(b1["left"], b2["left"])
                y_overlap = min(
                    b1["top"] + b1["height"], b2["top"] + b2["height"]
                ) - max(b1["top"], b2["top"])

                if x_overlap > 0 and y_overlap > 0:
                    if is_vertical:
                        if b1["top"] < b2["top"]:
                            new_h = max(1, b2["top"] - b1["top"])
                            b1["height"] = new_h
                    else:
                        if b1["left"] < b2["left"]:
                            b1_right = b1["left"] + b1["width"]
                            b2_right = b2["left"] + b2["width"]
                            left_slice_width = max(0, b2["left"] - b1["left"])
                            right_slice_width = max(0, b1_right - b2_right)

                            if (
                                b1_right > b2_right
                                and right_slice_width > left_slice_width
                            ):
                                b1["left"] = b2_right
                                b1["width"] = right_slice_width
                            else:
                                b1["width"] = max(1, left_slice_width)

        cleaned_output = {
            k: [] for k in ["text", "left", "top", "width", "height", "conf"]
        }
        if is_vertical:
            boxes.sort(key=lambda b: (b["top"], b["left"]))
        else:
            boxes.sort(key=lambda b: (b["left"], -b["width"]))

        for box in boxes:
            for key in cleaned_output.keys():
                cleaned_output[key].append(box[key])

        return cleaned_output

    def _is_geometry_valid(
        self,
        boxes: List[Tuple[int, int, int, int]],
        words: List[str],
        expected_height: float = 0,
    ) -> bool:
        """
        Validates if the detected boxes are physically plausible.
        [FIX] Improved robustness for punctuation and mixed-case text.
        """
        if len(boxes) != len(words):
            return False

        baseline = expected_height
        # Use median only if provided expected height is unreliable
        if baseline < 5:
            heights = [b[3] for b in boxes]
            if heights:
                baseline = np.median(heights)

        if baseline < 5:
            return True

        for i, box in enumerate(boxes):
            word = words[i]

            # [FIX] Check for punctuation/symbols. They are allowed to be small.
            # If word is just punctuation, skip geometry checks
            is_punctuation = not any(c.isalnum() for c in word)
            if is_punctuation:
                continue

            # Standard checks for alphanumeric words
            num_chars = len(word)
            if num_chars < 1:
                continue

            width = box[2]
            height = box[3]

            # [FIX] Only reject height if it's REALLY small compared to baseline
            # A period might be small, but we skipped that check above.
            # This check ensures a real word like "The" isn't 2 pixels tall.
            if height < (baseline * 0.20):
                return False

            avg_char_width = width / num_chars
            min_expected = baseline * 0.20

            # Only reject if it fails BOTH absolute (4px) and relative checks
            if avg_char_width < min_expected and avg_char_width < 4:
                # Exception: If the word is 1 char long (e.g. "I", "l", "1"), allow it to be skinny.
                if num_chars == 1 and avg_char_width >= 2:
                    continue
                return False

        return True

    def segment(
        self,
        line_data: Dict[str, List],
        line_image: np.ndarray,
        min_space_factor: float = MIN_SPACE_FACTOR,
        match_tolerance: int = MATCH_TOLERANCE,  # currently unused; kept for API compatibility
        image_name: Optional[str] = None,
    ) -> Tuple[Dict[str, List], bool]:

        if (
            line_image is None
            or not isinstance(line_image, np.ndarray)
            or line_image.size == 0
        ):
            return ({}, False)
        # Allow grayscale (2 dims) or color (3 dims)
        if len(line_image.shape) < 2:
            return ({}, False)
        if not line_data or not line_data.get("text") or len(line_data["text"]) == 0:
            return ({}, False)

        line_text = line_data["text"][0]
        words = line_text.split()

        # Early return if 1 or fewer words
        if len(words) <= 1:
            img_h, img_w = line_image.shape[:2]
            one_word_result = self.fallback_segmenter.convert_line_to_word_level(
                line_data, img_w, img_h
            )
            return (one_word_result, False)

        line_number = line_data.get("line", [0])[0]
        safe_image_name = _sanitize_filename(image_name or "image", max_length=50)
        safe_line_number = _sanitize_filename(str(line_number), max_length=10)
        safe_shortened_line_text = _sanitize_filename(line_text, max_length=10)

        if SAVE_WORD_SEGMENTER_OUTPUT_IMAGES:
            os.makedirs(self.output_folder, exist_ok=True)
            output_path = f"{self.output_folder}/word_segmentation/{safe_image_name}_{safe_line_number}_{safe_shortened_line_text}_original.png"
            os.makedirs(f"{self.output_folder}/word_segmentation", exist_ok=True)
            cv2.imwrite(output_path, line_image)

        if len(line_image.shape) == 3:
            gray = cv2.cvtColor(line_image, cv2.COLOR_BGR2GRAY)
        else:
            gray = line_image.copy()

        # ========================================================================
        # IMAGE PREPROCESSING (Deskew / Rotate)
        # ========================================================================
        oriented_gray, M_orient = self._correct_orientation(gray)
        deskewed_gray, M_skew = self._deskew_image(oriented_gray)

        # Combine matrices: M_total = M_skew * M_orient
        M_orient_3x3 = np.vstack([M_orient, [0, 0, 1]])
        M_skew_3x3 = np.vstack([M_skew, [0, 0, 1]])
        M_total_3x3 = M_skew_3x3 @ M_orient_3x3
        M = M_total_3x3[0:2, :]  # Extract 2x3 affine matrix

        # Apply transformation to the original color image
        h, w = deskewed_gray.shape
        deskewed_line_image = cv2.warpAffine(
            line_image,
            M,
            (w, h),
            flags=cv2.INTER_CUBIC,
            borderMode=cv2.BORDER_REPLICATE,
        )

        # [FIX] Create Local Line Data that matches the deskewed/rotated image dimensions.
        # This prevents the fallback segmenter from using vertical dimensions on a horizontal image.
        local_line_data = {
            "text": line_data["text"],
            "conf": line_data["conf"],
            "left": [0],  # Local coordinate system starts at 0
            "top": [0],
            "width": [w],  # Use the ROTATED width
            "height": [h],  # Use the ROTATED height
            "line": line_data.get("line", [0]),
        }

        if SAVE_WORD_SEGMENTER_OUTPUT_IMAGES:
            os.makedirs(self.output_folder, exist_ok=True)
            output_path = f"{self.output_folder}/word_segmentation/{safe_image_name}_{safe_line_number}_{safe_shortened_line_text}_deskewed.png"
            cv2.imwrite(output_path, deskewed_line_image)

        # ========================================================================
        # MAIN SEGMENTATION PIPELINE
        # ========================================================================
        approx_char_count = len(line_data["text"][0].replace(" ", ""))
        if approx_char_count == 0:
            return {}, False

        img_h, img_w = deskewed_gray.shape
        estimated_char_height = img_h * 0.6
        avg_char_width_approx = img_w / approx_char_count

        block_size = int(avg_char_width_approx * BLOCK_SIZE_FACTOR)
        if block_size % 2 == 0:
            block_size += 1
        if block_size < 3:
            block_size = 3

        # --- Binarization ---
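        # Two binarizations are combined: the adaptive threshold tracks
        # strokes under uneven lighting, while the stricter global threshold
        # (0.75 * Otsu) vetoes light-gray noise; AND-ing keeps only pixels
        # both methods agree are ink.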
        binary_adaptive = cv2.adaptiveThreshold(
            deskewed_gray,
            255,
            cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
            cv2.THRESH_BINARY_INV,
            block_size,
            C_VALUE,
        )
        otsu_thresh_val, _ = cv2.threshold(
            deskewed_gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU
        )
        strict_thresh_val = otsu_thresh_val * 0.75
        _, binary_strict = cv2.threshold(
            deskewed_gray, strict_thresh_val, 255, cv2.THRESH_BINARY_INV
        )
        binary = cv2.bitwise_and(binary_adaptive, binary_strict)

        if SAVE_WORD_SEGMENTER_OUTPUT_IMAGES:
            output_path = f"{self.output_folder}/word_segmentation/{safe_image_name}_{safe_line_number}_{safe_shortened_line_text}_binary.png"
            cv2.imwrite(output_path, binary)

        # --- Morphological Closing ---
        morph_width = max(3, int(avg_char_width_approx * 0.40))
        morph_height = max(2, int(avg_char_width_approx * 0.1))
        kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (morph_width, morph_height))
        closed_binary = cv2.morphologyEx(binary, cv2.MORPH_CLOSE, kernel, iterations=1)

        # --- Noise Removal ---
        num_labels, labels, stats, _ = cv2.connectedComponentsWithStats(
            closed_binary, 8, cv2.CV_32S
        )
        clean_binary = np.zeros_like(binary)

        force_fallback = False
        significant_labels = 0
        if num_labels > 1:
            # Only count components with area > 3 pixels
            significant_labels = np.sum(stats[1:, cv2.CC_STAT_AREA] > 3)

        if approx_char_count > 0 and significant_labels > (approx_char_count * 12):
            force_fallback = True

        if num_labels > 1:
            areas = stats[1:, cv2.CC_STAT_AREA]
            if len(areas) == 0:
                clean_binary = binary
                areas = np.array([0])
            else:
                p1 = np.percentile(areas, 1)
                img_h, img_w = binary.shape
                estimated_char_height = img_h * 0.7
                estimated_min_letter_area = max(
                    2, int(estimated_char_height * 0.2 * estimated_char_height * 0.15)
                )
                area_threshold = max(
                    MIN_AREA_THRESHOLD, min(p1, estimated_min_letter_area)
                )

                # Gap detection logic...
                sorted_areas = np.sort(areas)
                area_diffs = np.diff(sorted_areas)
                if len(sorted_areas) > 10 and len(area_diffs) > 0:
                    jump_threshold = np.percentile(area_diffs, 95)
                    significant_jump_thresh = max(10, jump_threshold * 3)
                    jump_indices = np.where(area_diffs > significant_jump_thresh)[0]
                    if len(jump_indices) > 0:
                        gap_idx = jump_indices[0]
                        area_before_gap = sorted_areas[gap_idx]
                        final_threshold = max(area_before_gap + 1, area_threshold)
                        final_threshold = min(final_threshold, 15)
                        area_threshold = final_threshold

                for i in range(1, num_labels):
                    if stats[i, cv2.CC_STAT_AREA] >= area_threshold:
                        clean_binary[labels == i] = 255
        else:
            clean_binary = binary

        # --- Vertical Cropping ---
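        # Crop to the vertical core of the line: the 5th/95th percentile rows
        # of the ink profile bound the text body, and trimming a further 10%
        # of that core from each side suppresses ascender/descender tails.
        # If the trim collapses the band below 5 px, the untrimmed core is used.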
        horizontal_projection = np.sum(clean_binary, axis=1)
        y_start = 0
        non_zero_rows = np.where(horizontal_projection > 0)[0]
        if len(non_zero_rows) > 0:
            p_top = int(np.percentile(non_zero_rows, 5))
            p_bottom = int(np.percentile(non_zero_rows, 95))
            core_height = p_bottom - p_top
            trim_pixels = int(core_height * 0.1)
            y_start = max(0, p_top + trim_pixels)
            y_end = min(clean_binary.shape[0], p_bottom - trim_pixels)
            if y_end - y_start < 5:
                y_start = p_top
                y_end = p_bottom
            analysis_image = clean_binary[y_start:y_end, :]
        else:
            analysis_image = clean_binary

        if SAVE_WORD_SEGMENTER_OUTPUT_IMAGES:
            output_path = f"{self.output_folder}/word_segmentation/{safe_image_name}_{safe_line_number}_{safe_shortened_line_text}_clean_binary.png"
            cv2.imwrite(output_path, analysis_image)

        # --- Adaptive Search ---
        best_boxes = None
        successful_binary_image = None

        if not force_fallback:
            words = line_data["text"][0].split()
            target = len(words)
            backup_boxes_s1 = None

            # STAGE 1
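            # Sweeping the valley-threshold factor upward lets progressively
            # denser columns count as gaps, which tends to split the profile
            # into more boxes; stop at the first count that matches the word
            # count with plausible geometry.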
            for v_factor in np.arange(INITIAL_VALLEY_THRESHOLD_FACTOR, 0.60, 0.02):
                curr_boxes = self._get_boxes_from_profile(
                    analysis_image, avg_char_width_approx, min_space_factor, v_factor
                )
                diff = abs(target - len(curr_boxes))
                is_geom_valid = self._is_geometry_valid(
                    curr_boxes, words, estimated_char_height
                )

                if diff == 0:
                    if is_geom_valid:
                        best_boxes = curr_boxes
                        successful_binary_image = analysis_image
                        break
                    else:
                        if backup_boxes_s1 is None:
                            backup_boxes_s1 = curr_boxes
                if diff == 1 and backup_boxes_s1 is None and is_geom_valid:
                    backup_boxes_s1 = curr_boxes

            # STAGE 2 (if needed)
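            # Stage 2 works in the opposite direction: widening a horizontal
            # closing kernel fuses nearby character blobs, which tends to
            # reduce the box count toward the target word count.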
            if best_boxes is None:
                backup_boxes_s2 = None
                for k_factor in np.arange(INITIAL_KERNEL_WIDTH_FACTOR, 0.5, 0.02):
                    k_w = max(1, int(avg_char_width_approx * k_factor))
                    s2_bin = cv2.morphologyEx(
                        clean_binary, cv2.MORPH_CLOSE, np.ones((1, k_w), np.uint8)
                    )
                    s2_img = (
                        s2_bin[y_start:y_end, :] if len(non_zero_rows) > 0 else s2_bin
                    )

                    if s2_img is None or s2_img.size == 0:
                        continue

                    curr_boxes = self._get_boxes_from_profile(
                        s2_img,
                        avg_char_width_approx,
                        min_space_factor,
                        MAIN_VALLEY_THRESHOLD_FACTOR,
                    )
                    diff = abs(target - len(curr_boxes))
                    is_geom_valid = self._is_geometry_valid(
                        curr_boxes, words, estimated_char_height
                    )

                    if diff == 0 and is_geom_valid:
                        best_boxes = curr_boxes
                        successful_binary_image = s2_bin
                        break

                    if diff == 1 and backup_boxes_s2 is None and is_geom_valid:
                        backup_boxes_s2 = curr_boxes

                if best_boxes is None:
                    if backup_boxes_s1 is not None:
                        best_boxes = backup_boxes_s1
                        successful_binary_image = analysis_image
                    elif backup_boxes_s2 is not None:
                        best_boxes = backup_boxes_s2
                        successful_binary_image = clean_binary

        final_output = None
        used_fallback = False

        if best_boxes is None:
            # --- FALLBACK WITH ROTATED DATA ---
            used_fallback = True
            # [FIX] Use local_line_data (rotated dims) instead of line_data (original dims)
            final_output = self.fallback_segmenter.refine_words_bidirectional(
                local_line_data, deskewed_line_image
            )
        else:
            # --- CCA Refinement ---
            unlabeled_boxes = best_boxes
            if successful_binary_image is analysis_image:
                cca_source_image = clean_binary
            else:
                cca_source_image = successful_binary_image

            num_labels, _, stats, _ = cv2.connectedComponentsWithStats(
                cca_source_image, 8, cv2.CV_32S
            )
            cca_img_h, cca_img_w = cca_source_image.shape[:2]

            component_assignments = {}
            num_proc = min(len(words), len(unlabeled_boxes))
            min_valid_component_area = estimated_char_height * 2

            for j in range(1, num_labels):
                comp_x = stats[j, cv2.CC_STAT_LEFT]
                comp_w = stats[j, cv2.CC_STAT_WIDTH]
                comp_area = stats[j, cv2.CC_STAT_AREA]
                comp_r = comp_x + comp_w
                comp_center_x = comp_x + comp_w / 2
                comp_y = stats[j, cv2.CC_STAT_TOP]
                comp_h = stats[j, cv2.CC_STAT_HEIGHT]
                comp_center_y = comp_y + comp_h / 2

                if comp_center_y < cca_img_h * 0.1 or comp_center_y > cca_img_h * 0.9:
                    continue
                if comp_area < min_valid_component_area:
                    continue

                best_box_idx = None
                max_overlap = 0
                best_center_distance = float("inf")
                component_center_in_box = False

                # Assign this component to the word box it overlaps best,
                # preferring boxes that contain the component's centre.
                for i in range(num_proc):
                    box_x, box_y, box_w, box_h = unlabeled_boxes[i]
                    box_r = box_x + box_w
                    box_center_x = box_x + box_w / 2

                    if comp_w > box_w * 1.5:
                        continue

                    if comp_x < box_r and box_x < comp_r:
                        overlap_start = max(comp_x, box_x)
                        overlap_end = min(comp_r, box_r)
                        overlap = overlap_end - overlap_start

                        if overlap > 0:
                            center_in_box = box_x <= comp_center_x < box_r
                            center_distance = abs(comp_center_x - box_center_x)

                            if center_in_box:
                                if not component_center_in_box or overlap > max_overlap:
                                    component_center_in_box = True
                                    best_center_distance = center_distance
                                    max_overlap = overlap
                                    best_box_idx = i
                            elif not component_center_in_box:
                                if center_distance < best_center_distance or (
                                    center_distance == best_center_distance
                                    and overlap > max_overlap
                                ):
                                    best_center_distance = center_distance
                                    max_overlap = overlap
                                    best_box_idx = i

                if best_box_idx is not None:
                    component_assignments[j] = best_box_idx

            refined_boxes_list = []
            for i in range(num_proc):
                word_label = words[i]
                components_in_box = [
                    stats[j] for j, b in component_assignments.items() if b == i
                ]

                use_original_box = False
                if not components_in_box:
                    use_original_box = True
                else:
                    min_x = min(c[cv2.CC_STAT_LEFT] for c in components_in_box)
                    min_y = min(c[cv2.CC_STAT_TOP] for c in components_in_box)
                    max_r = max(
                        c[cv2.CC_STAT_LEFT] + c[cv2.CC_STAT_WIDTH]
                        for c in components_in_box
                    )
                    max_b = max(
                        c[cv2.CC_STAT_TOP] + c[cv2.CC_STAT_HEIGHT]
                        for c in components_in_box
                    )
                    cca_h = max(1, max_b - min_y)
                    if cca_h < (estimated_char_height * 0.35):
                        use_original_box = True

                if use_original_box:
                    box_x, box_y, box_w, box_h = unlabeled_boxes[i]
                    adjusted_box_y = y_start + box_y
                    refined_boxes_list.append(
                        {
                            "text": word_label,
                            "left": box_x,
                            "top": adjusted_box_y,
                            "width": box_w,
                            "height": box_h,
                            "conf": line_data["conf"][0],
                        }
                    )
                else:
                    refined_boxes_list.append(
                        {
                            "text": word_label,
                            "left": min_x,
                            "top": min_y,
                            "width": max(1, max_r - min_x),
                            "height": cca_h,
                            "conf": line_data["conf"][0],
                        }
                    )

            # Check validity
            cca_check_list = [
                (b["left"], b["top"], b["width"], b["height"])
                for b in refined_boxes_list
            ]
            if not self._is_geometry_valid(
                cca_check_list, words, estimated_char_height
            ):
                if abs(len(refined_boxes_list) - len(words)) > 1:
                    best_boxes = None  # Trigger fallback
                else:
                    final_output = {
                        k: []
                        for k in ["text", "left", "top", "width", "height", "conf"]
                    }
                    for box in refined_boxes_list:
                        for key in final_output.keys():
                            final_output[key].append(box[key])
            else:
                final_output = {
                    k: [] for k in ["text", "left", "top", "width", "height", "conf"]
                }
                for box in refined_boxes_list:
                    for key in final_output.keys():
                        final_output[key].append(box[key])

        # --- REPEAT FALLBACK IF VALIDATION FAILED ---
        if best_boxes is None and not used_fallback:
            used_fallback = True
            # [FIX] Use local_line_data here too
            final_output = self.fallback_segmenter.refine_words_bidirectional(
                local_line_data, deskewed_line_image
            )

        # ========================================================================
        # COORDINATE TRANSFORMATION (Map back to Original)
        # ========================================================================
        M_inv = cv2.invertAffineTransform(M)
        remapped_boxes_list = []
        for i in range(len(final_output["text"])):
            left, top = final_output["left"][i], final_output["top"][i]
            width, height = final_output["width"][i], final_output["height"][i]

            # Map the 4 corners
            corners = np.array(
                [
                    [left, top],
                    [left + width, top],
                    [left + width, top + height],
                    [left, top + height],
                ],
                dtype="float32",
            )
            corners_expanded = np.expand_dims(corners, axis=1)
            original_corners = cv2.transform(corners_expanded, M_inv)
            squeezed_corners = original_corners.squeeze(axis=1)

            # Get axis aligned bounding box in original space
            min_x = int(np.min(squeezed_corners[:, 0]))
            max_x = int(np.max(squeezed_corners[:, 0]))
            min_y = int(np.min(squeezed_corners[:, 1]))
            max_y = int(np.max(squeezed_corners[:, 1]))

            remapped_boxes_list.append(
                {
                    "text": final_output["text"][i],
                    "left": min_x,
                    "top": min_y,
                    "width": max_x - min_x,
                    "height": max_y - min_y,
                    "conf": final_output["conf"][i],
                }
            )

        remapped_output = {k: [] for k in final_output.keys()}
        for box in remapped_boxes_list:
            for key in remapped_output.keys():
                remapped_output[key].append(box[key])

        img_h, img_w = line_image.shape[:2]
        remapped_output = self._enforce_logical_constraints(
            remapped_output, img_w, img_h
        )

        # ========================================================================
        # FINAL SAFETY NET
        # ========================================================================
        words = line_data["text"][0].split()
        target_count = len(words)
        current_count = len(remapped_output["text"])
        has_collapsed_boxes = any(w < 3 for w in remapped_output["width"])

        if current_count > 0:
            total_text_len = sum(len(t) for t in remapped_output["text"])
            total_box_width = sum(remapped_output["width"])
            avg_width_pixels = total_box_width / max(1, total_text_len)
        else:
            avg_width_pixels = 0
        is_suspiciously_thin = avg_width_pixels < 4

        if current_count != target_count or is_suspiciously_thin or has_collapsed_boxes:
            used_fallback = True

            # [FIX] Do NOT use original line_image/line_data here.
            # Use the local_line_data + deskewed_line_image pipeline,
            # then transform back using M_inv (same as above).

            # 1. Run fallback on rotated data
            temp_local_output = self.fallback_segmenter.refine_words_bidirectional(
                local_line_data, deskewed_line_image
            )

            # 2. If bidirectional failed to split correctly, use purely mathematical split on rotated data
            if len(temp_local_output["text"]) != target_count:
                h, w = deskewed_line_image.shape[:2]
                temp_local_output = self.fallback_segmenter.convert_line_to_word_level(
                    local_line_data, w, h
                )

            # 3. Transform the result back to original coordinates (M_inv)
            # (Repeating the transformation logic for the safety net result)
            remapped_boxes_list = []
            for i in range(len(temp_local_output["text"])):
                left, top = temp_local_output["left"][i], temp_local_output["top"][i]
                width, height = (
                    temp_local_output["width"][i],
                    temp_local_output["height"][i],
                )

                corners = np.array(
                    [
                        [left, top],
                        [left + width, top],
                        [left + width, top + height],
                        [left, top + height],
                    ],
                    dtype="float32",
                )
                corners_expanded = np.expand_dims(corners, axis=1)
                original_corners = cv2.transform(corners_expanded, M_inv)
                squeezed_corners = original_corners.squeeze(axis=1)

                min_x = int(np.min(squeezed_corners[:, 0]))
                max_x = int(np.max(squeezed_corners[:, 0]))
                min_y = int(np.min(squeezed_corners[:, 1]))
                max_y = int(np.max(squeezed_corners[:, 1]))

                remapped_boxes_list.append(
                    {
                        "text": temp_local_output["text"][i],
                        "left": min_x,
                        "top": min_y,
                        "width": max_x - min_x,
                        "height": max_y - min_y,
                        "conf": temp_local_output["conf"][i],
                    }
                )

            remapped_output = {k: [] for k in temp_local_output.keys()}
            for box in remapped_boxes_list:
                for key in remapped_output.keys():
                    remapped_output[key].append(box[key])

        if SAVE_WORD_SEGMENTER_OUTPUT_IMAGES:
            output_path = f"{self.output_folder}/word_segmentation/{safe_image_name}_{safe_shortened_line_text}_final_boxes.png"
            os.makedirs(f"{self.output_folder}/word_segmentation", exist_ok=True)
            output_image_vis = line_image.copy()
            for i in range(len(remapped_output["text"])):
                x, y, w, h = (
                    int(remapped_output["left"][i]),
                    int(remapped_output["top"][i]),
                    int(remapped_output["width"][i]),
                    int(remapped_output["height"][i]),
                )
                cv2.rectangle(output_image_vis, (x, y), (x + w, y + h), (0, 255, 0), 2)
            cv2.imwrite(output_path, output_image_vis)

        return remapped_output, used_fallback


class HybridWordSegmenter:
    """
    Implements a two-step approach for word segmentation:
    1. Proportional estimation based on text.
    2. Image-based refinement with a "Bounded Scan" to prevent
       over-correction.
    """

    def convert_line_to_word_level(
        self, line_data: Dict[str, List], image_width: int, image_height: int
    ) -> Dict[str, List]:
        """
        Step 1: Converts line-level OCR results to word-level by using a
        robust proportional estimation method.
        Guarantees output box count equals input word count.
        """
        output = {
            "text": list(),
            "left": list(),
            "top": list(),
            "width": list(),
            "height": list(),
            "conf": list(),
        }

        if not line_data or not line_data.get("text"):
            return output

        i = 0  # Assuming a single line
        line_text = line_data["text"][i]
        line_left = float(line_data["left"][i])
        line_top = float(line_data["top"][i])
        line_width = float(line_data["width"][i])
        line_height = float(line_data["height"][i])
        line_conf = line_data["conf"][i]

        if not line_text.strip():
            return output
        words = line_text.split()
        if not words:
            return output
        num_chars = len("".join(words))
        num_spaces = len(words) - 1
        if num_chars == 0:
            return output

        if (num_chars * 2 + num_spaces) > 0:
            char_space_ratio = 2.0
            estimated_space_width = line_width / (
                num_chars * char_space_ratio + num_spaces
            )
            avg_char_width = estimated_space_width * char_space_ratio
        else:
            avg_char_width = line_width / (num_chars if num_chars > 0 else 1)
            estimated_space_width = avg_char_width
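
        # Worked example of the proportional estimate above: a 300 px line
        # with 10 characters and 1 space gives estimated_space_width =
        # 300 / (10 * 2 + 1) ~ 14.3 px and avg_char_width ~ 28.6 px.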

        # [SAFETY CHECK] Ensure we never estimate a character width of ~0
        avg_char_width = max(3.0, avg_char_width)
        min_word_width = max(5.0, avg_char_width * 0.5)

        current_left = line_left
        for word in words:
            raw_word_width = len(word) * avg_char_width

            # Force the box to have a legible size
            word_width = max(min_word_width, raw_word_width)

            clamped_left = max(0, min(current_left, image_width))
            # We do NOT clamp the width against image_width here because that
            # causes the "0 width" bug if current_left is at the edge.
            # It is better to have a box go off-screen than be 0-width.

            output["text"].append(word)
            output["left"].append(clamped_left)
            output["top"].append(line_top)
            output["width"].append(word_width)
            output["height"].append(line_height)
            output["conf"].append(line_conf)
            current_left += word_width + estimated_space_width

        return output

    def _run_single_pass(
        self,
        initial_boxes: List[Dict],
        vertical_projection: np.ndarray,
        max_scan_distance: int,
        img_w: int,
        direction: str = "ltr",
    ) -> List[Dict]:
        """
        Helper function to run one pass of refinement.
        IMPROVED: Uses local minima detection for cursive script where
        perfect zero-gaps (white space) might not exist.
        """

        refined_boxes = [box.copy() for box in initial_boxes]

        if direction == "ltr":
            last_corrected_right_edge = 0
            indices = range(len(refined_boxes))
        else:  # rtl
            next_corrected_left_edge = img_w
            indices = range(len(refined_boxes) - 1, -1, -1)

        for i in indices:
            box = refined_boxes[i]
            left = int(box["left"])
            right = int(box["left"] + box["width"])

            left = max(0, min(left, img_w - 1))
            right = max(0, min(right, img_w - 1))

            new_left, new_right = left, right

            # --- Boundary search with improved gap detection ---
            # Priority 1: True gap (zero projection)
            # Priority 2: Valley with lowest ink density (thinnest connection)

            if direction == "ltr" or direction == "both":  # Scan right logic
                if right < img_w:
                    scan_limit = min(img_w, right + max_scan_distance)
                    search_range = range(right, scan_limit)

                    best_x = right
                    min_density = float("inf")
                    found_zero = False

                    # Look for the best cut in the window
                    for x in search_range:
                        density = vertical_projection[x]
                        if density == 0:
                            new_right = x
                            found_zero = True
                            break
                        if density < min_density:
                            min_density = density
                            best_x = x

                    if not found_zero:
                        # No clear gap found, cut at thinnest point (minimum density)
                        new_right = best_x

            if direction == "rtl" or direction == "both":  # Scan left logic
                if left > 0:
                    scan_limit = max(0, left - max_scan_distance)
                    search_range = range(left, scan_limit, -1)

                    best_x = left
                    min_density = float("inf")
                    found_zero = False

                    for x in search_range:
                        density = vertical_projection[x]
                        if density == 0:
                            new_left = x
                            found_zero = True
                            break
                        if density < min_density:
                            min_density = density
                            best_x = x

                    if not found_zero:
                        new_left = best_x

            # --- Directional de-overlapping (strict stitching) ---
            if direction == "ltr":
                if new_left < last_corrected_right_edge:
                    new_left = last_corrected_right_edge
                # Ensure valid width
                if new_right <= new_left:
                    new_right = new_left + 1
                last_corrected_right_edge = new_right
            else:  # rtl
                if new_right > next_corrected_left_edge:
                    new_right = next_corrected_left_edge
                # Ensure valid width
                if new_left >= new_right:
                    new_left = new_right - 1
                next_corrected_left_edge = new_left

            box["left"] = new_left
            box["width"] = max(1, new_right - new_left)

        return refined_boxes

    def refine_words_bidirectional(
        self,
        line_data: Dict[str, List],
        line_image: np.ndarray,
    ) -> Dict[str, List]:
        """
        Refines boxes using a more robust bidirectional scan and averaging.
        Includes ADAPTIVE NOISE REMOVAL to filter specks based on font size.
        """
        if line_image is None:
            return line_data

        # Early return if the line data is missing or holds 1 or fewer words.
        # (`words` is also needed below for the bounded scan estimate.)
        if not line_data or not line_data.get("text"):
            return line_data
        words = line_data["text"][0].split()
        if len(words) <= 1:
            img_h, img_w = line_image.shape[:2]
            return self.convert_line_to_word_level(line_data, img_w, img_h)

        # --- PRE-PROCESSING: Stricter Binarization ---
        if len(line_image.shape) == 3:
            gray = cv2.cvtColor(line_image, cv2.COLOR_BGR2GRAY)
        else:
            gray = line_image.copy()

        # 1. Calculate standard Otsu threshold first
        otsu_thresh_val, _ = cv2.threshold(
            gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU
        )

        # 2. Apply "Strictness Factor" to remove dark noise
        # 0.75 means "Only keep pixels that are in the darkest 75% of what Otsu thought was foreground"
        # This effectively filters out light-gray noise shadows.
        strict_thresh_val = otsu_thresh_val * 0.75
        _, binary = cv2.threshold(gray, strict_thresh_val, 255, cv2.THRESH_BINARY_INV)

        img_h, img_w = binary.shape

        # [NEW STEP 1] Morphological Opening
        # Physically erodes small protrusions and dust (2x2 pixels or smaller)
        kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (2, 2))
        binary_clean = cv2.morphologyEx(binary, cv2.MORPH_OPEN, kernel)

        # [NEW STEP 2] Adaptive Component Filtering
        # Instead of hardcoded pixels, we filter relative to the line's text size.
        num_labels, labels, stats, _ = cv2.connectedComponentsWithStats(
            binary_clean, 8, cv2.CV_32S
        )

        # Get heights of all components (excluding background)
        heights = stats[1:, cv2.CC_STAT_HEIGHT]

        if len(heights) > 0:
            # Calculate Median Height of "significant" parts (ignore tiny noise for the median calculation)
            # We assume valid text is at least 20% of the image height
            significant_heights = heights[heights > img_h * 0.2]
            if len(significant_heights) > 0:
                median_h = np.median(significant_heights)
            else:
                median_h = np.median(heights)

            # Define Thresholds based on Text Size
            # 1. Main Threshold: Keep parts taller than 30% of median letter height
            min_height_thresh = median_h * 0.30

            clean_binary = np.zeros_like(binary)
            for i in range(1, num_labels):
                h = stats[i, cv2.CC_STAT_HEIGHT]
                w = stats[i, cv2.CC_STAT_WIDTH]
                area = stats[i, cv2.CC_STAT_AREA]

                # Logic: Keep the component IF:
                # A. It is tall enough to be a letter part (h > threshold)
                # B. OR it is a "Dot" (Period / i-dot):
                #    - Height is small (< threshold)
                #    - Width is ALSO small (roughly square, prevents flat dash/scratch noise)
                #    - Area is reasonable (> 2px)

                is_tall_enough = h > min_height_thresh
                is_dot = (
                    (h <= min_height_thresh) and (w <= min_height_thresh) and (area > 2)
                )

                if is_tall_enough or is_dot:
                    clean_binary[labels == i] = 255

            # Use the adaptively cleaned image for projection
            vertical_projection = np.sum(clean_binary, axis=0)
        else:
            # Fallback if no components found (unlikely)
            vertical_projection = np.sum(binary, axis=0)

        # --- Rest of logic remains the same ---
        char_blobs = []
        in_blob = False
        blob_start = 0
        for x, col_sum in enumerate(vertical_projection):
            if col_sum > 0 and not in_blob:
                blob_start = x
                in_blob = True
            elif col_sum == 0 and in_blob:
                char_blobs.append((blob_start, x))
                in_blob = False
        if in_blob:
            char_blobs.append((blob_start, img_w))

        if not char_blobs:
            return self.convert_line_to_word_level(line_data, img_w, img_h)

        # [PREVIOUS FIX] Bounded Scan Distance
        total_chars = len("".join(words))
        if total_chars > 0:
            geom_avg_char_width = img_w / total_chars
        else:
            geom_avg_char_width = 10

        blob_avg_char_width = np.mean([end - start for start, end in char_blobs])
        safe_avg_char_width = min(blob_avg_char_width, geom_avg_char_width * 1.5)
        max_scan_distance = int(safe_avg_char_width * 2.0)

        # [PREVIOUS FIX] Safety Floor
        min_safe_box_width = max(4, int(safe_avg_char_width * 0.5))

        estimated_data = self.convert_line_to_word_level(line_data, img_w, img_h)
        if not estimated_data["text"]:
            return estimated_data

        initial_boxes = []
        for i in range(len(estimated_data["text"])):
            initial_boxes.append(
                {
                    "text": estimated_data["text"][i],
                    "left": estimated_data["left"][i],
                    "top": estimated_data["top"][i],
                    "width": estimated_data["width"][i],
                    "height": estimated_data["height"][i],
                    "conf": estimated_data["conf"][i],
                }
            )

        # --- STEP 1 & 2: Perform bidirectional refinement passes ---
        ltr_boxes = self._run_single_pass(
            initial_boxes, vertical_projection, max_scan_distance, img_w, "ltr"
        )
        rtl_boxes = self._run_single_pass(
            initial_boxes, vertical_projection, max_scan_distance, img_w, "rtl"
        )

        # --- STEP 3: Combine results using best edge from each pass ---
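        # Each pass stitches boxes against already-corrected neighbours in its
        # scan direction, so the LTR pass yields reliable left edges and the
        # RTL pass reliable right edges; take one of each.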
        combined_boxes = [box.copy() for box in initial_boxes]
        for i in range(len(combined_boxes)):
            final_left = ltr_boxes[i]["left"]
            rtl_right = rtl_boxes[i]["left"] + rtl_boxes[i]["width"]

            combined_boxes[i]["left"] = final_left
            combined_boxes[i]["width"] = max(min_safe_box_width, rtl_right - final_left)

        # --- STEP 4: Contiguous stitching to eliminate gaps ---
        for i in range(len(combined_boxes) - 1):
            if combined_boxes[i + 1]["left"] <= combined_boxes[i]["left"]:
                combined_boxes[i + 1]["left"] = (
                    combined_boxes[i]["left"] + min_safe_box_width
                )

        for i in range(len(combined_boxes) - 1):
            curr = combined_boxes[i]
            nxt = combined_boxes[i + 1]
            gap_width = nxt["left"] - curr["left"]
            curr["width"] = max(min_safe_box_width, gap_width)

        # Convert back to output dict
        final_output = {k: [] for k in estimated_data.keys()}
        for box in combined_boxes:
            if box["width"] >= min_safe_box_width:
                for key in final_output.keys():
                    final_output[key].append(box[key])

        return final_output
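

if __name__ == "__main__":
    # Minimal smoke-test sketch, not part of the pipeline. It assumes
    # `tools.config` is importable and SAVE_WORD_SEGMENTER_OUTPUT_IMAGES is
    # disabled; the rendered text, font, and geometry are arbitrary choices.
    canvas = np.full((48, 320), 255, dtype=np.uint8)
    cv2.putText(canvas, "hello world", (10, 34), cv2.FONT_HERSHEY_SIMPLEX, 1.0, 0, 2)
    demo_line_data = {
        "text": ["hello world"],
        "conf": [95.0],
        "left": [0],
        "top": [0],
        "width": [canvas.shape[1]],
        "height": [canvas.shape[0]],
        "line": [0],
    }
    word_boxes, used_fallback = AdaptiveSegmenter().segment(
        demo_line_data, canvas, image_name="demo"
    )
    for word, x, w in zip(word_boxes["text"], word_boxes["left"], word_boxes["width"]):
        print(f"{word!r}: left={x}, width={w} (fallback={used_fallback})")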