Spaces:
Sleeping
Sleeping
| import cv2 | |
| import numpy as np | |
| from PIL import Image | |
| from typing import List, Dict, Tuple | |
class LayoutParser:
    """Detect text lines and columns in a document image and guess its writing style.

    All public methods are best-effort: they catch any exception, print a
    diagnostic message, and return a fallback value instead of raising.
    """

    def __init__(self):
        pass

    @staticmethod
    def _binarize(img):
        """Return an inverted Otsu mask of a BGR image (dark pixels become 255)."""
        gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
        # Blur first so Otsu's threshold is not driven by pixel-level noise.
        blur = cv2.GaussianBlur(gray, (5, 5), 0)
        _, thresh = cv2.threshold(blur, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)
        return thresh

    def analyze_layout(self, image_path: str) -> Dict:
        """Analyze document image layout to detect columns and lines of text.

        Args:
            image_path: Path of the image file to read with ``cv2.imread``.

        Returns:
            A dict with keys ``"width"``, ``"height"``, ``"column_count"`` and
            ``"columns"``; each column is a dict as produced by
            ``_group_lines_into_columns``. On any failure (including an
            unreadable image) the error is printed and
            ``{"width": 0, "height": 0, "column_count": 1, "columns": []}``
            is returned.
        """
        try:
            img = cv2.imread(image_path)
            if img is None:
                raise FileNotFoundError(f"Image not found: {image_path}")
            h_img, w_img = img.shape[:2]

            # Step 1: binarize (ink -> white on black).
            thresh = self._binarize(img)

            # Step 2: dilate with a wide, flat kernel so the words of one text
            # line merge into a single connected component.
            line_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (25, 3))
            dilated = cv2.dilate(thresh, line_kernel, iterations=2)

            # Step 3: each outer contour of the dilated mask is a line candidate.
            contours, _ = cv2.findContours(dilated, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
            lines = []
            for cnt in contours:
                x, y, w, h = cv2.boundingRect(cnt)
                # Drop specks of noise ...
                if w < 15 or h < 5:
                    continue
                # ... and boxes spanning (almost) the whole page, e.g. a page border.
                if w > w_img * 0.98 or h > h_img * 0.98:
                    continue
                lines.append({
                    "box": (x, y, w, h),
                    "area": w * h,
                })

            # Order by top coordinate only; the horizontal (column) grouping is
            # done by _group_lines_into_columns.
            lines.sort(key=lambda line: line["box"][1])
            columns = self._group_lines_into_columns(lines, w_img)

            structured_layout = {
                "width": w_img,
                "height": h_img,
                "column_count": len(columns),
                "columns": columns,
            }
            print(f"[INFO] Layout parsing complete. Detected {len(columns)} text columns.")
            return structured_layout
        except Exception as e:
            print(f"[ERROR] Layout parsing failed: {e}")
            return {"width": 0, "height": 0, "column_count": 1, "columns": []}

    @staticmethod
    def _find_dense_runs(hist, min_width: int) -> List[Tuple[int, int]]:
        """Return ``(start, end)`` spans (end exclusive) where ``hist > 1``.

        Only spans at least ``min_width`` wide are returned. The same width
        filter is applied to every span, including one that touches the right
        end of ``hist``.
        """
        dense = (hist > 1).astype(np.int8)
        # Pad with zeros so a run touching either edge still yields both a
        # rising and a falling edge; the non-zero positions of the difference
        # then alternate start, end, start, end, ...
        padded = np.concatenate(([0], dense, [0]))
        edges = np.flatnonzero(np.diff(padded))
        return [
            (int(start), int(end))
            for start, end in zip(edges[::2], edges[1::2])
            if end - start >= min_width
        ]

    def _group_lines_into_columns(self, lines: List[Dict], page_width: int) -> List[Dict]:
        """Group detected text lines into columns using a horizontal projection.

        Args:
            lines: Dicts with a ``"box"`` entry holding ``(x, y, w, h)``.
            page_width: Width of the page in pixels.

        Returns:
            One dict per column, left to right, with ``"x_range"`` as a
            ``(start, end)`` tuple and ``"lines"`` as a list of
            ``(x, y, w, h)`` tuples sorted by ``y``. Returns ``[]`` when
            ``lines`` is empty. If no column span is found, a single column
            covering ``(0, page_width)`` is used.
        """
        if not lines:
            return []

        # hist[x] = number of line boxes covering pixel column x.
        hist = np.zeros(page_width, dtype=np.int32)
        for line in lines:
            x, _, w, _ = line["box"]
            hist[x:x + w] += 1

        # A column is a span covered by more than one line and at least 10% of
        # the page wide.
        min_col_width = int(page_width * 0.1)
        columns_x = self._find_dense_runs(hist, min_col_width)
        if not columns_x:
            columns_x = [(0, page_width)]

        cols_data = [{"x_range": rx, "lines": []} for rx in columns_x]
        for line in lines:
            x, y, w, h = line["box"]
            line_center_x = x + w / 2

            # Prefer the first column containing the line centre; otherwise
            # fall back to the column whose nearest edge is closest.
            best_idx = 0
            min_dist = page_width
            for idx, col in enumerate(cols_data):
                cx_start, cx_end = col["x_range"]
                if cx_start <= line_center_x <= cx_end:
                    best_idx = idx
                    break
                dist = min(abs(line_center_x - cx_start), abs(line_center_x - cx_end))
                if dist < min_dist:
                    min_dist = dist
                    best_idx = idx
            cols_data[best_idx]["lines"].append((x, y, w, h))

        # Within each column, order lines by their top coordinate.
        for col in cols_data:
            col["lines"] = sorted(col["lines"], key=lambda box: box[1])
        return cols_data

    def crop_lines(self, image_path: str, layout: Dict) -> "List[Image.Image]":
        """Crop the detected text lines out of the image as RGB PIL images.

        Args:
            image_path: Path of the image file to read with ``cv2.imread``.
            layout: Layout dict; its ``"columns"`` entry is iterated in order
                and each column's ``"lines"`` are ``(x, y, w, h)`` boxes.

        Returns:
            Crops in column order, then line order within each column. Returns
            ``[]`` if the image cannot be read or any error occurs (the error
            is printed).
        """
        try:
            img = cv2.imread(image_path)
            if img is None:
                return []
            h_img, w_img = img.shape[:2]
            crops = []
            for col in layout.get("columns", []):
                for (x, y, w, h) in col["lines"]:
                    # Small proportional padding gives HTR/OCR some context.
                    pad_y = int(h * 0.1) + 2
                    pad_x = int(w * 0.05) + 2
                    # Clamp the padded box to the image bounds.
                    y0 = max(0, y - pad_y)
                    y1 = min(h_img, y + h + pad_y)
                    x0 = max(0, x - pad_x)
                    x1 = min(w_img, x + w + pad_x)
                    crop = img[y0:y1, x0:x1]
                    if crop.size > 0:
                        crops.append(Image.fromarray(cv2.cvtColor(crop, cv2.COLOR_BGR2RGB)))
            return crops
        except Exception as e:
            print(f"[ERROR] Failed to crop layout lines: {e}")
            return []

    def _classify_style_with_clip(self, image_path: str, clip_classifier) -> str:
        """Classify the image as 'printed' or 'cursive' with a CLIP-style model.

        Uses ``clip_classifier.processor``, ``clip_classifier.model`` and
        ``clip_classifier.device``. Any exception propagates to the caller.
        """
        from PIL import Image
        import torch

        # Close the underlying file as soon as the RGB copy has been made.
        with Image.open(image_path) as src:
            image = src.convert("RGB")

        styles = ["printed", "cursive"]
        descriptions = [
            "classical printed Latin text or carved Roman stone monumental inscription with clean block capital letters",
            "medieval handwritten Latin manuscript text written in ink on parchment with cursive handwriting"
        ]
        inputs = clip_classifier.processor(
            text=descriptions,
            images=image,
            return_tensors="pt",
            padding=True
        ).to(clip_classifier.device)

        with torch.no_grad():
            outputs = clip_classifier.model(**inputs)
        logits_per_image = outputs.logits_per_image
        probs = logits_per_image.softmax(dim=1).cpu().numpy()[0]

        best_idx = int(np.argmax(probs))
        style_label = styles[best_idx]
        confidence = float(probs[best_idx])
        print(f"[INFO] CLIP Latin style classification: {style_label} ({confidence:.3f})")
        return style_label

    def _classify_style_with_heuristics(self, image_path: str) -> str:
        """Classify the image from the aspect ratios of its connected components.

        Returns 'printed' when the mean width/height ratio of the components
        (after dropping those under 5 px in either dimension) is below 1.3,
        otherwise 'cursive'. Returns 'cursive' when the image cannot be read
        or no component survives the filter.
        """
        img = cv2.imread(image_path)
        if img is None:
            return "cursive"  # Safe default

        thresh = self._binarize(img)
        # No dilation here: components should stay at roughly character level.
        contours, _ = cv2.findContours(thresh, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

        aspect_ratios = []
        widths = []
        for cnt in contours:
            _, _, w, h = cv2.boundingRect(cnt)
            # Filter noise
            if w < 5 or h < 5:
                continue
            aspect_ratios.append(w / h)
            widths.append(w)
        if not aspect_ratios:
            return "cursive"

        avg_aspect_ratio = np.mean(aspect_ratios)
        median_width = np.median(widths)
        # Heuristic: separate block letters give roughly square components,
        # while connected handwriting gives wider ones.
        print(f"[DEBUG] Layout heuristics - connected components: {len(aspect_ratios)}, avg aspect ratio: {avg_aspect_ratio:.3f}, median width: {median_width:.1f}")
        return "printed" if avg_aspect_ratio < 1.3 else "cursive"

    def detect_writing_style(self, image_path: str, clip_classifier=None) -> str:
        """Detect whether the image shows 'printed' capitals or 'cursive' handwriting.

        Args:
            image_path: Path of the image file.
            clip_classifier: Optional object exposing ``model``, ``processor``
                and ``device``. It is used only when it and both ``model`` and
                ``processor`` are truthy; a missing attribute is treated the
                same as an unset one.

        Returns:
            ``"printed"`` or ``"cursive"``. If the CLIP path raises, a warning
            is printed and the computer-vision heuristics run instead. If
            those fail too, ``"cursive"`` is returned.
        """
        try:
            # 1. Try the CLIP classifier if a usable one was provided.
            if (
                clip_classifier
                and getattr(clip_classifier, "model", None)
                and getattr(clip_classifier, "processor", None)
            ):
                try:
                    return self._classify_style_with_clip(image_path, clip_classifier)
                except Exception as e:
                    print(f"[WARN] CLIP Latin style detection failed: {e}. Falling back to heuristics.")

            # 2. Fallback: computer-vision heuristics.
            print("[INFO] Running computer vision heuristics for Latin style detection...")
            return self._classify_style_with_heuristics(image_path)
        except Exception as e:
            print(f"[WARN] Latin style detection failed completely: {e}. Defaulting to cursive.")
            return "cursive"