import cv2 import numpy as np from PIL import Image from typing import List, Dict, Tuple class LayoutParser: def __init__(self): pass def analyze_layout(self, image_path: str) -> Dict: """Analyze document image layout to detect columns, blocks, and lines of text""" try: img = cv2.imread(image_path) if img is None: raise FileNotFoundError(f"Image not found: {image_path}") h_img, w_img, _ = img.shape gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY) # Step 1: Preprocess to remove noise and binarize # Use Otsu's thresholding after Gaussian blur blur = cv2.GaussianBlur(gray, (5, 5), 0) _, thresh = cv2.threshold(blur, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU) # Step 2: Dilation to merge words into horizontal line segments # Use larger horizontal kernel to join words along text lines line_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (25, 3)) dilated = cv2.dilate(thresh, line_kernel, iterations=2) # Step 3: Find contours of lines contours, _ = cv2.findContours(dilated, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE) lines = [] for cnt in contours: x, y, w, h = cv2.boundingRect(cnt) # Filter out small noise and full page boundaries if w < 15 or h < 5: continue if w > w_img * 0.98 or h > h_img * 0.98: continue lines.append({ "box": (x, y, w, h), "area": w * h }) # Sort lines from top-to-bottom, left-to-right (handles multi-column layouts) # We group lines into columns based on horizontal positions lines = sorted(lines, key=lambda l: l["box"][1]) # sort by top coord first columns = self._group_lines_into_columns(lines, w_img) structured_layout = { "width": w_img, "height": h_img, "column_count": len(columns), "columns": columns } print(f"[INFO] Layout parsing complete. Detected {len(columns)} text columns.") return structured_layout except Exception as e: print(f"[ERROR] Layout parsing failed: {e}") return {"width": 0, "height": 0, "column_count": 1, "columns": []} def _group_lines_into_columns(self, lines: List[Dict], page_width: int) -> List[Dict]: """Group detected text lines into column blocks based on horizontal overlap""" if not lines: return [] # Find horizontal overlaps using a histogram projection hist = np.zeros(page_width, dtype=np.int32) for line in lines: x, _, w, _ = line["box"] hist[x:x+w] += 1 # Threshold histogram to find column boundaries min_col_width = int(page_width * 0.1) columns_x = [] in_col = False start_x = 0 for x, val in enumerate(hist): if val > 1 and not in_col: in_col = True start_x = x elif val <= 1 and in_col: in_col = False end_x = x if (end_x - start_x) >= min_col_width: columns_x.append((start_x, end_x)) # Handle case where column stretches to the end if in_col: columns_x.append((start_x, page_width)) if not columns_x: columns_x = [(0, page_width)] # Assign lines to closest columns cols_data = [{"x_range": rx, "lines": []} for rx in columns_x] for line in lines: x, y, w, h = line["box"] line_center_x = x + w / 2 # Find the best column index best_idx = 0 min_dist = page_width for idx, col in enumerate(cols_data): cx_start, cx_end = col["x_range"] if cx_start <= line_center_x <= cx_end: best_idx = idx break else: dist = min(abs(line_center_x - cx_start), abs(line_center_x - cx_end)) if dist < min_dist: min_dist = dist best_idx = idx cols_data[best_idx]["lines"].append((x, y, w, h)) # Sort lines inside each column by vertical (y) coordinate for col in cols_data: col["lines"] = sorted(col["lines"], key=lambda box: box[1]) return cols_data def crop_lines(self, image_path: str, layout: Dict) -> List[Image.Image]: """Crop and return PIL images of detected text lines in reading order""" try: img = cv2.imread(image_path) if img is None: return [] crops = [] h_img, w_img, _ = img.shape for col in layout.get("columns", []): for (x, y, w, h) in col["lines"]: # Add small padding for HTR/OCR context pad_y = int(h * 0.1) + 2 pad_x = int(w * 0.05) + 2 y0 = max(0, y - pad_y) y1 = min(h_img, y + h + pad_y) x0 = max(0, x - pad_x) x1 = min(w_img, x + w + pad_x) crop = img[y0:y1, x0:x1] if crop.size > 0: crops.append(Image.fromarray(cv2.cvtColor(crop, cv2.COLOR_BGR2RGB))) return crops except Exception as e: print(f"[ERROR] Failed to crop layout lines: {e}") return [] def detect_writing_style(self, image_path: str, clip_classifier=None) -> str: """Detect if document image contains 'printed' capital letters or 'cursive' handwriting""" try: # 1. Try using CLIP classifier if provided if clip_classifier and clip_classifier.model and clip_classifier.processor: try: from PIL import Image image = Image.open(image_path).convert("RGB") styles = ["printed", "cursive"] descriptions = [ "classical printed Latin text or carved Roman stone monumental inscription with clean block capital letters", "medieval handwritten Latin manuscript text written in ink on parchment with cursive handwriting" ] inputs = clip_classifier.processor( text=descriptions, images=image, return_tensors="pt", padding=True ).to(clip_classifier.device) import torch with torch.no_grad(): outputs = clip_classifier.model(**inputs) logits_per_image = outputs.logits_per_image probs = logits_per_image.softmax(dim=1).cpu().numpy()[0] best_idx = np.argmax(probs) style_label = styles[best_idx] confidence = float(probs[best_idx]) print(f"[INFO] CLIP Latin style classification: {style_label} ({confidence:.3f})") return style_label except Exception as e: print(f"[WARN] CLIP Latin style detection failed: {e}. Falling back to heuristics.") # 2. Fallback: Computer Vision heuristics print("[INFO] Running computer vision heuristics for Latin style detection...") img = cv2.imread(image_path) if img is None: return "cursive" # Safe default gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY) blur = cv2.GaussianBlur(gray, (5, 5), 0) _, thresh = cv2.threshold(blur, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU) # Find contours without heavy dilation (character level components) contours, _ = cv2.findContours(thresh, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE) if not contours: return "cursive" aspect_ratios = [] widths = [] heights = [] for cnt in contours: x, y, w, h = cv2.boundingRect(cnt) # Filter noise if w < 5 or h < 5: continue aspect_ratios.append(w / h) widths.append(w) heights.append(h) if not aspect_ratios: return "cursive" avg_aspect_ratio = np.mean(aspect_ratios) median_width = np.median(widths) # Printed characters are typically individual, tall/square shapes: width ~ height, aspect ratio close to 0.7 - 1.2 # Cursive handwriting consists of connected letters, forming wider horizontal segments: aspect ratio > 1.5 print(f"[DEBUG] Layout heuristics - connected components: {len(aspect_ratios)}, avg aspect ratio: {avg_aspect_ratio:.3f}, median width: {median_width:.1f}") if avg_aspect_ratio < 1.3: return "printed" else: return "cursive" except Exception as e: print(f"[WARN] Latin style detection failed completely: {e}. Defaulting to cursive.") return "cursive"