File size: 10,159 Bytes
2f4af3f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
import cv2
import numpy as np
from PIL import Image
from typing import List, Dict, Tuple

class LayoutParser:
    """Detect text lines and columns in a document image using OpenCV.

    The parser keeps no instance state: every public method receives the path
    of the image it works on. Failures are reported with ``print`` and turned
    into fallback return values instead of being propagated to the caller.
    """

    def __init__(self):
        """Create a parser; there is nothing to configure."""

    def analyze_layout(self, image_path: str) -> Dict:
        """Detect text lines in an image and group them into columns.

        Args:
            image_path: Path of the image file, read with ``cv2.imread``.

        Returns:
            A dict with ``"width"`` and ``"height"`` (image size in pixels),
            ``"column_count"`` and ``"columns"`` (the list produced by
            ``_group_lines_into_columns``). If any step fails, the error is
            printed and the fallback ``{"width": 0, "height": 0,
            "column_count": 1, "columns": []}`` is returned; note that the
            fallback reports one column while listing none.
        """
        try:
            img = cv2.imread(image_path)
            if img is None:
                # cv2.imread signals an unreadable file by returning None.
                raise FileNotFoundError(f"Image not found: {image_path}")

            h_img, w_img, _ = img.shape
            gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

            # Step 1: blur, then binarize with Otsu's threshold. The inverted
            # mode makes dark ink the foreground (255) for the steps below.
            blur = cv2.GaussianBlur(gray, (5, 5), 0)
            _, thresh = cv2.threshold(blur, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)

            # Step 2: dilate with a wide, flat kernel so that words on the same
            # text line merge into one horizontal blob.
            line_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (25, 3))
            dilated = cv2.dilate(thresh, line_kernel, iterations=2)

            # Step 3: each outer contour of the dilated mask is a line candidate.
            contours, _ = cv2.findContours(dilated, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

            lines = []
            for cnt in contours:
                x, y, w, h = cv2.boundingRect(cnt)

                # Drop specks of noise ...
                if w < 15 or h < 5:
                    continue
                # ... and boxes spanning (almost) the whole page in either
                # direction, such as page borders.
                if w > w_img * 0.98 or h > h_img * 0.98:
                    continue

                lines.append({
                    "box": (x, y, w, h),
                    "area": w * h
                })

            # Order top-to-bottom first; the column grouping below then takes
            # care of the left-to-right structure of multi-column pages.
            lines = sorted(lines, key=lambda l: l["box"][1])

            columns = self._group_lines_into_columns(lines, w_img)

            structured_layout = {
                "width": w_img,
                "height": h_img,
                "column_count": len(columns),
                "columns": columns
            }

            print(f"[INFO] Layout parsing complete. Detected {len(columns)} text columns.")
            return structured_layout

        except Exception as e:
            print(f"[ERROR] Layout parsing failed: {e}")
            return {"width": 0, "height": 0, "column_count": 1, "columns": []}

    def _group_lines_into_columns(self, lines: List[Dict], page_width: int) -> List[Dict]:
        """Group detected text lines into columns by horizontal coverage.

        A horizontal run of pixels covered by at least two line boxes is a
        column candidate; candidates narrower than 10% of the page width are
        discarded. This width filter applies to every run, including one that
        reaches the right edge of the page.

        Args:
            lines: Dicts whose ``"box"`` entry is an ``(x, y, w, h)`` tuple.
            page_width: Width of the page in pixels.

        Returns:
            One dict per column, ordered left to right, holding ``"x_range"``
            (``(start_x, end_x)`` with an exclusive end) and ``"lines"`` (the
            ``(x, y, w, h)`` boxes assigned to that column, sorted by ``y``).
            When no candidate survives, a single column spanning
            ``(0, page_width)`` is used. Returns ``[]`` when ``lines`` is empty.
        """
        if not lines:
            return []

        # For every pixel column, count how many line boxes cover it.
        hist = np.zeros(page_width, dtype=np.int32)
        for line in lines:
            x, _, w, _ = line["box"]
            hist[x:x + w] += 1

        # Pad the coverage mask with a zero on both sides so that every run,
        # including one touching the page edge, yields a rising and a falling
        # edge. All runs are then filtered by the same minimum width.
        covered = np.concatenate(([0], (hist > 1).astype(np.int8), [0]))
        edges = np.diff(covered)
        run_starts = np.flatnonzero(edges == 1)
        run_ends = np.flatnonzero(edges == -1)  # exclusive end of each run

        min_col_width = int(page_width * 0.1)
        columns_x = [
            (int(start), int(end))
            for start, end in zip(run_starts, run_ends)
            if end - start >= min_col_width
        ]

        if not columns_x:
            columns_x = [(0, page_width)]

        cols_data = [{"x_range": rx, "lines": []} for rx in columns_x]

        for line in lines:
            x, y, w, h = line["box"]
            line_center_x = x + w / 2

            # The first column containing the line's center wins; otherwise
            # pick the column whose nearest edge is closest to the center.
            best_idx = 0
            min_dist = page_width
            for idx, (cx_start, cx_end) in enumerate(columns_x):
                if cx_start <= line_center_x <= cx_end:
                    best_idx = idx
                    break
                dist = min(abs(line_center_x - cx_start), abs(line_center_x - cx_end))
                if dist < min_dist:
                    min_dist = dist
                    best_idx = idx

            cols_data[best_idx]["lines"].append((x, y, w, h))

        # Within a column, lines are ordered top to bottom.
        for col in cols_data:
            col["lines"].sort(key=lambda box: box[1])

        return cols_data

    def crop_lines(self, image_path: str, layout: Dict) -> List[Image.Image]:
        """Crop every detected text line out of the image.

        Args:
            image_path: Path of the image file, read with ``cv2.imread``.
            layout: Dict with a ``"columns"`` list as returned by
                ``analyze_layout``; each column's ``"lines"`` holds
                ``(x, y, w, h)`` boxes.

        Returns:
            RGB PIL images, column by column and in each column's line order.
            Returns ``[]`` when the image cannot be read or when any error
            occurs (the error is printed).
        """
        try:
            img = cv2.imread(image_path)
            if img is None:
                return []

            crops = []
            h_img, w_img, _ = img.shape

            for col in layout.get("columns", []):
                for (x, y, w, h) in col["lines"]:
                    # Pad each box a little (10% of its height and 5% of its
                    # width, plus 2 px) to give the recognizer some context.
                    pad_y = int(h * 0.1) + 2
                    pad_x = int(w * 0.05) + 2

                    # Clamp the padded box to the image bounds.
                    y0 = max(0, y - pad_y)
                    y1 = min(h_img, y + h + pad_y)
                    x0 = max(0, x - pad_x)
                    x1 = min(w_img, x + w + pad_x)

                    crop = img[y0:y1, x0:x1]
                    if crop.size > 0:
                        # OpenCV arrays are BGR; convert to RGB for PIL.
                        crops.append(Image.fromarray(cv2.cvtColor(crop, cv2.COLOR_BGR2RGB)))

            return crops
        except Exception as e:
            print(f"[ERROR] Failed to crop layout lines: {e}")
            return []

    def detect_writing_style(self, image_path: str, clip_classifier=None) -> str:
        """Classify the writing in an image as ``"printed"`` or ``"cursive"``.

        When ``clip_classifier`` is given and has truthy ``model`` and
        ``processor`` attributes, the image is scored against two text
        descriptions and the better match is returned. If that path is
        unavailable or raises, a connected-component heuristic is used: an
        average bounding-box aspect ratio (width / height) below 1.3 means
        ``"printed"``, anything else ``"cursive"``.

        Args:
            image_path: Path of the image file.
            clip_classifier: Optional object exposing ``model``, ``processor``
                and ``device`` attributes (presumably a Hugging Face CLIP
                wrapper; confirm against the caller).

        Returns:
            ``"printed"`` or ``"cursive"``. ``"cursive"`` is the default when
            the image cannot be read, contains no usable components, or an
            unexpected error occurs.
        """
        try:
            # 1. Preferred path: zero-shot classification with CLIP.
            if clip_classifier and clip_classifier.model and clip_classifier.processor:
                try:
                    image = Image.open(image_path).convert("RGB")

                    # styles[i] is the label returned when descriptions[i]
                    # scores highest.
                    styles = ["printed", "cursive"]
                    descriptions = [
                        "classical printed Latin text or carved Roman stone monumental inscription with clean block capital letters",
                        "medieval handwritten Latin manuscript text written in ink on parchment with cursive handwriting"
                    ]

                    inputs = clip_classifier.processor(
                        text=descriptions,
                        images=image,
                        return_tensors="pt",
                        padding=True
                    ).to(clip_classifier.device)

                    # Imported here so torch is only required on this path.
                    import torch
                    with torch.no_grad():
                        outputs = clip_classifier.model(**inputs)
                        logits_per_image = outputs.logits_per_image
                        probs = logits_per_image.softmax(dim=1).cpu().numpy()[0]

                    best_idx = int(np.argmax(probs))
                    style_label = styles[best_idx]
                    confidence = float(probs[best_idx])
                    print(f"[INFO] CLIP Latin style classification: {style_label} ({confidence:.3f})")
                    return style_label
                except Exception as e:
                    print(f"[WARN] CLIP Latin style detection failed: {e}. Falling back to heuristics.")

            # 2. Fallback: computer vision heuristics.
            print("[INFO] Running computer vision heuristics for Latin style detection...")
            img = cv2.imread(image_path)
            if img is None:
                return "cursive"  # Safe default

            gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
            blur = cv2.GaussianBlur(gray, (5, 5), 0)
            _, thresh = cv2.threshold(blur, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)

            # No dilation here: contours should stay at character level.
            contours, _ = cv2.findContours(thresh, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

            if not contours:
                return "cursive"

            aspect_ratios = []
            widths = []

            for cnt in contours:
                _, _, w, h = cv2.boundingRect(cnt)
                # Components smaller than 5 px in either dimension are noise.
                if w < 5 or h < 5:
                    continue
                aspect_ratios.append(w / h)
                widths.append(w)

            if not aspect_ratios:
                return "cursive"

            avg_aspect_ratio = np.mean(aspect_ratios)
            median_width = np.median(widths)

            # Printed characters tend to be separate, tall or square components
            # (low width/height ratio); connected cursive strokes form wider
            # components. The decision threshold on the average ratio is 1.3.
            print(f"[DEBUG] Layout heuristics - connected components: {len(aspect_ratios)}, avg aspect ratio: {avg_aspect_ratio:.3f}, median width: {median_width:.1f}")

            if avg_aspect_ratio < 1.3:
                return "printed"
            else:
                return "cursive"

        except Exception as e:
            print(f"[WARN] Latin style detection failed completely: {e}. Defaulting to cursive.")
            return "cursive"