# Spaces: Akshay30 / decipherai-api (Sleeping)
# File size: 10,159 Bytes
# 2f4af3f

import cv2
import numpy as np
from PIL import Image
from typing import List, Dict, Tuple

class LayoutParser:
    def __init__(self):
        pass

    def analyze_layout(self, image_path: str) -> Dict:
        """Analyze document image layout to detect columns, blocks, and lines of text"""
        try:
            img = cv2.imread(image_path)
            if img is None:
                raise FileNotFoundError(f"Image not found: {image_path}")
            
            h_img, w_img, _ = img.shape
            gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
            
            # Step 1: Preprocess to remove noise and binarize
            # Use Otsu's thresholding after Gaussian blur
            blur = cv2.GaussianBlur(gray, (5, 5), 0)
            _, thresh = cv2.threshold(blur, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)
            
            # Step 2: Dilation to merge words into horizontal line segments
            # Use larger horizontal kernel to join words along text lines
            line_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (25, 3))
            dilated = cv2.dilate(thresh, line_kernel, iterations=2)
            
            # Step 3: Find contours of lines
            contours, _ = cv2.findContours(dilated, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
            
            lines = []
            for cnt in contours:
                x, y, w, h = cv2.boundingRect(cnt)
                
                # Filter out small noise and full page boundaries
                if w < 15 or h < 5:
                    continue
                if w > w_img * 0.98 or h > h_img * 0.98:
                    continue
                
                lines.append({
                    "box": (x, y, w, h),
                    "area": w * h
                })
            
            # Sort lines from top-to-bottom, left-to-right (handles multi-column layouts)
            # We group lines into columns based on horizontal positions
            lines = sorted(lines, key=lambda l: l["box"][1])  # sort by top coord first
            
            columns = self._group_lines_into_columns(lines, w_img)
            
            structured_layout = {
                "width": w_img,
                "height": h_img,
                "column_count": len(columns),
                "columns": columns
            }
            
            print(f"[INFO] Layout parsing complete. Detected {len(columns)} text columns.")
            return structured_layout
            
        except Exception as e:
            print(f"[ERROR] Layout parsing failed: {e}")
            return {"width": 0, "height": 0, "column_count": 1, "columns": []}

    def _group_lines_into_columns(self, lines: List[Dict], page_width: int) -> List[Dict]:
        """Group detected text lines into column blocks based on horizontal overlap"""
        if not lines:
            return []
            
        # Find horizontal overlaps using a histogram projection
        hist = np.zeros(page_width, dtype=np.int32)
        for line in lines:
            x, _, w, _ = line["box"]
            hist[x:x+w] += 1
            
        # Threshold histogram to find column boundaries
        min_col_width = int(page_width * 0.1)
        columns_x = []
        in_col = False
        start_x = 0
        
        for x, val in enumerate(hist):
            if val > 1 and not in_col:
                in_col = True
                start_x = x
            elif val <= 1 and in_col:
                in_col = False
                end_x = x
                if (end_x - start_x) >= min_col_width:
                    columns_x.append((start_x, end_x))
                    
        # Handle case where column stretches to the end
        if in_col:
            columns_x.append((start_x, page_width))
            
        if not columns_x:
            columns_x = [(0, page_width)]
            
        # Assign lines to closest columns
        cols_data = [{"x_range": rx, "lines": []} for rx in columns_x]
        
        for line in lines:
            x, y, w, h = line["box"]
            line_center_x = x + w / 2
            
            # Find the best column index
            best_idx = 0
            min_dist = page_width
            for idx, col in enumerate(cols_data):
                cx_start, cx_end = col["x_range"]
                if cx_start <= line_center_x <= cx_end:
                    best_idx = idx
                    break
                else:
                    dist = min(abs(line_center_x - cx_start), abs(line_center_x - cx_end))
                    if dist < min_dist:
                        min_dist = dist
                        best_idx = idx
            
            cols_data[best_idx]["lines"].append((x, y, w, h))
            
        # Sort lines inside each column by vertical (y) coordinate
        for col in cols_data:
            col["lines"] = sorted(col["lines"], key=lambda box: box[1])
            
        return cols_data

    def crop_lines(self, image_path: str, layout: Dict) -> List[Image.Image]:
        """Crop and return PIL images of detected text lines in reading order"""
        try:
            img = cv2.imread(image_path)
            if img is None:
                return []
                
            crops = []
            h_img, w_img, _ = img.shape
            
            for col in layout.get("columns", []):
                for (x, y, w, h) in col["lines"]:
                    # Add small padding for HTR/OCR context
                    pad_y = int(h * 0.1) + 2
                    pad_x = int(w * 0.05) + 2
                    
                    y0 = max(0, y - pad_y)
                    y1 = min(h_img, y + h + pad_y)
                    x0 = max(0, x - pad_x)
                    x1 = min(w_img, x + w + pad_x)
                    
                    crop = img[y0:y1, x0:x1]
                    if crop.size > 0:
                        crops.append(Image.fromarray(cv2.cvtColor(crop, cv2.COLOR_BGR2RGB)))
            
            return crops
        except Exception as e:
            print(f"[ERROR] Failed to crop layout lines: {e}")
            return []

    def detect_writing_style(self, image_path: str, clip_classifier=None) -> str:
        """Detect if document image contains 'printed' capital letters or 'cursive' handwriting"""
        try:
            # 1. Try using CLIP classifier if provided
            if clip_classifier and clip_classifier.model and clip_classifier.processor:
                try:
                    from PIL import Image
                    image = Image.open(image_path).convert("RGB")
                    
                    styles = ["printed", "cursive"]
                    descriptions = [
                        "classical printed Latin text or carved Roman stone monumental inscription with clean block capital letters",
                        "medieval handwritten Latin manuscript text written in ink on parchment with cursive handwriting"
                    ]
                    
                    inputs = clip_classifier.processor(
                        text=descriptions,
                        images=image,
                        return_tensors="pt",
                        padding=True
                    ).to(clip_classifier.device)
                    
                    import torch
                    with torch.no_grad():
                        outputs = clip_classifier.model(**inputs)
                        logits_per_image = outputs.logits_per_image
                        probs = logits_per_image.softmax(dim=1).cpu().numpy()[0]
                        
                    best_idx = np.argmax(probs)
                    style_label = styles[best_idx]
                    confidence = float(probs[best_idx])
                    print(f"[INFO] CLIP Latin style classification: {style_label} ({confidence:.3f})")
                    return style_label
                except Exception as e:
                    print(f"[WARN] CLIP Latin style detection failed: {e}. Falling back to heuristics.")
            
            # 2. Fallback: Computer Vision heuristics
            print("[INFO] Running computer vision heuristics for Latin style detection...")
            img = cv2.imread(image_path)
            if img is None:
                return "cursive"  # Safe default
                
            gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
            blur = cv2.GaussianBlur(gray, (5, 5), 0)
            _, thresh = cv2.threshold(blur, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)
            
            # Find contours without heavy dilation (character level components)
            contours, _ = cv2.findContours(thresh, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
            
            if not contours:
                return "cursive"
                
            aspect_ratios = []
            widths = []
            heights = []
            
            for cnt in contours:
                x, y, w, h = cv2.boundingRect(cnt)
                # Filter noise
                if w < 5 or h < 5:
                    continue
                aspect_ratios.append(w / h)
                widths.append(w)
                heights.append(h)
                
            if not aspect_ratios:
                return "cursive"
                
            avg_aspect_ratio = np.mean(aspect_ratios)
            median_width = np.median(widths)
            
            # Printed characters are typically individual, tall/square shapes: width ~ height, aspect ratio close to 0.7 - 1.2
            # Cursive handwriting consists of connected letters, forming wider horizontal segments: aspect ratio > 1.5
            print(f"[DEBUG] Layout heuristics - connected components: {len(aspect_ratios)}, avg aspect ratio: {avg_aspect_ratio:.3f}, median width: {median_width:.1f}")
            
            if avg_aspect_ratio < 1.3:
                return "printed"
            else:
                return "cursive"
                
        except Exception as e:
            print(f"[WARN] Latin style detection failed completely: {e}. Defaulting to cursive.")
            return "cursive"