File size: 5,211 Bytes
c4ef1cf
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9513cca
 
 
c4ef1cf
 
 
 
 
 
 
 
 
 
9513cca
 
 
c4ef1cf
 
 
 
 
 
 
 
 
 
 
9513cca
 
 
c4ef1cf
 
 
 
 
 
 
 
 
9513cca
 
 
 
 
 
c4ef1cf
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
from __future__ import annotations

from dataclasses import dataclass
from typing import Any, Dict, Tuple

import numpy as np
from PIL import Image


@dataclass(frozen=True)
class CropEmptyConfig:
    """Immutable settings consumed by ``crop_empty``.

    A pixel is "white" when the mean of its RGB channels is strictly greater
    than ``color_threshold``. A row or column counts as content when its white
    fraction is below ``min_white_fraction`` and its non-white fraction is
    above the density threshold that applies to the edge being located.
    """

    # Not read by the detection logic in crop_empty; it is only echoed back in
    # the returned "config" report. NOTE(review): may be used elsewhere.
    percentage_to_remove: float = 0.9
    # When True the bottom edge is located with content_density_main_text;
    # when False it is located with content_density_any.
    remove_page_number: bool = False
    # Mean-RGB intensity above which a pixel is treated as white.
    color_threshold: int = 240
    # A row/column is only content if its white-pixel fraction is below this.
    min_white_fraction: float = 0.99
    # Minimum non-white fraction for the top, left and right edges.
    content_density_sides: float = 0.001
    # Minimum non-white fraction for the bottom edge when remove_page_number is True.
    content_density_main_text: float = 0.05
    # Minimum non-white fraction for the bottom edge when remove_page_number is False.
    content_density_any: float = 1e-6
    # Pixels of margin kept around the detected box (clamped to the image bounds).
    preserve_border_px: int = 1
    # When > 0, rows/columns whose intensity standard deviation is <= this value
    # are never treated as content; 0.0 disables the check.
    uniform_rowcol_std_threshold: float = 0.0


def crop_empty(
    image: Image.Image, *, config: CropEmptyConfig
) -> Tuple[Image.Image, Dict[str, Any]]:
    """Crop near-white margins from ``image`` and describe the result.

    The image is converted to RGB and reduced to a per-pixel mean intensity.
    A pixel is "white" when that intensity is strictly greater than
    ``config.color_threshold``. A row or column counts as content when its
    white fraction is below ``config.min_white_fraction`` and its non-white
    fraction exceeds the density threshold for the edge being located. When
    ``config.uniform_rowcol_std_threshold`` is positive, rows/columns whose
    intensity standard deviation is at or below it are never content.

    The top, left and right edges use ``config.content_density_sides``. The
    bottom edge uses ``config.content_density_main_text`` when
    ``config.remove_page_number`` is true and ``config.content_density_any``
    otherwise. ``config.percentage_to_remove`` does not influence detection;
    it is only echoed in the report.

    Args:
        image: Source image; it is not modified.
        config: Detection thresholds and padding.

    Returns:
        ``(result, info)``. If no non-empty content box is detected, ``result``
        is the original ``image`` object and ``info`` has ``"applied": False``
        and ``"crop_box": None``. Otherwise ``result`` is the RGB crop and
        ``info`` has ``"applied": True`` and ``"crop_box"`` as
        ``[left, top, right, bottom]``. ``info`` always carries the original
        and resulting dimensions plus a ``"config"`` echo of the settings.
    """
    img = image.convert("RGB")
    intensity = np.array(img).mean(axis=2)
    n_rows, n_cols = intensity.shape
    width, height = img.size

    is_white = intensity > config.color_threshold
    std_threshold = float(config.uniform_rowcol_std_threshold)

    # Index 0 holds per-row statistics, index 1 per-column statistics. They are
    # computed once so that every edge lookup reuses the same profile.
    white_fraction = (is_white.mean(axis=1), is_white.mean(axis=0))
    if std_threshold > 0.0:
        # Written as ~(std <= t) rather than (std > t) so that a NaN standard
        # deviation is not classified as "uniform".
        not_uniform = (
            ~(intensity.std(axis=1) <= std_threshold),
            ~(intensity.std(axis=0) <= std_threshold),
        )
    else:
        not_uniform = None

    def _content_indices(axis: int, min_density: float) -> np.ndarray:
        """Return ascending indices of rows (axis 0) or columns (axis 1) with content."""
        white = white_fraction[axis]
        is_content = (white < config.min_white_fraction) & ((1.0 - white) > min_density)
        if not_uniform is not None:
            is_content &= not_uniform[axis]
        return np.flatnonzero(is_content)

    sides_density = float(config.content_density_sides)
    content_rows = _content_indices(0, sides_density)
    content_cols = _content_indices(1, sides_density)

    # Only the bottom threshold that is actually selected needs to be scanned.
    bottom_density = float(
        config.content_density_main_text
        if config.remove_page_number
        else config.content_density_any
    )
    bottom_rows = _content_indices(0, bottom_density)

    # "Nothing found" sentinels: start edges fall at the far end, end edges at 0,
    # which makes the box empty/inverted.
    top = int(content_rows[0]) if content_rows.size else int(n_rows)
    left = int(content_cols[0]) if content_cols.size else int(n_cols)
    right = int(content_cols[-1]) + 1 if content_cols.size else 0
    bottom = int(bottom_rows[-1]) + 1 if bottom_rows.size else 0

    config_report = {
        "percentage_to_remove": float(config.percentage_to_remove),
        "remove_page_number": bool(config.remove_page_number),
        "color_threshold": int(config.color_threshold),
        "min_white_fraction": float(config.min_white_fraction),
        "content_density_sides": float(config.content_density_sides),
        "content_density_main_text": float(config.content_density_main_text),
        "content_density_any": float(config.content_density_any),
        "preserve_border_px": int(config.preserve_border_px),
        "uniform_rowcol_std_threshold": float(config.uniform_rowcol_std_threshold),
    }

    # Validate the detection BEFORE padding: padding an empty or inverted box
    # could otherwise turn "no content found" into a small bogus crop.
    if not (left < right and top < bottom):
        return image, {
            "applied": False,
            "crop_box": None,
            "original_width": int(width),
            "original_height": int(height),
            "cropped_width": int(width),
            "cropped_height": int(height),
            "config": config_report,
        }

    pad = max(int(getattr(config, "preserve_border_px", 0) or 0), 0)
    if pad > 0:
        # Keep a margin around the content, clamped to the image bounds.
        left = max(left - pad, 0)
        top = max(top - pad, 0)
        right = min(right + pad, int(width))
        bottom = min(bottom + pad, int(height))

    crop_box = (left, top, right, bottom)
    cropped = img.crop(crop_box)
    return cropped, {
        "applied": True,
        "crop_box": [int(edge) for edge in crop_box],
        "original_width": int(width),
        "original_height": int(height),
        "cropped_width": int(cropped.width),
        "cropped_height": int(cropped.height),
        "config": config_report,
    }