# decipherai-api/services/layout_parser.py
# Initial DecipherAI backend deployment (commit 2f4af3f, uploaded by Akshay30)
import cv2
import numpy as np
from PIL import Image
from typing import List, Dict, Tuple
class LayoutParser:
    """Detect text lines and columns in a document image and classify its script style.

    All public methods are best-effort: they log failures with ``print`` and
    return a neutral fallback value instead of raising.
    """

    # Wide, flat structuring element so dilation merges words along a text line
    # without merging vertically adjacent lines.
    LINE_KERNEL_SIZE = (25, 3)
    LINE_DILATE_ITERATIONS = 2
    # Bounding boxes smaller than this (in pixels) are treated as noise.
    MIN_LINE_WIDTH = 15
    MIN_LINE_HEIGHT = 5
    # Boxes spanning more than this fraction of the page are treated as the page border.
    MAX_PAGE_FRACTION = 0.98
    # A column must cover at least this fraction of the page width.
    MIN_COLUMN_FRACTION = 0.1
    # Connected components smaller than this (in pixels) are ignored by the style heuristic.
    MIN_GLYPH_SIZE = 5
    # Mean width/height ratio below which the style heuristic answers "printed".
    PRINTED_MAX_ASPECT_RATIO = 1.3

    def __init__(self):
        """Create a parser; it holds no per-instance state."""

    @staticmethod
    def _binarize(img):
        """Return an inverted Otsu-thresholded mask of a BGR image (ink is white)."""
        gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
        # Blur first so Otsu's threshold is not driven by pixel-level noise.
        blur = cv2.GaussianBlur(gray, (5, 5), 0)
        _, thresh = cv2.threshold(blur, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)
        return thresh

    def analyze_layout(self, image_path: str) -> Dict:
        """Analyze a document image to detect text lines and group them into columns.

        Args:
            image_path: Path of the image to read with ``cv2.imread``.

        Returns:
            A dict with keys ``"width"``, ``"height"``, ``"column_count"`` and
            ``"columns"``. Each column is a dict with ``"x_range"``
            ``(start, end)`` and ``"lines"``, a list of ``(x, y, w, h)`` boxes
            sorted top to bottom. On any failure (including an unreadable
            image) the fallback ``{"width": 0, "height": 0, "column_count": 1,
            "columns": []}`` is returned.
        """
        try:
            img = cv2.imread(image_path)
            if img is None:
                raise FileNotFoundError(f"Image not found: {image_path}")
            h_img, w_img = img.shape[:2]

            # Step 1: binarize. Step 2: dilate horizontally so words fuse into lines.
            thresh = self._binarize(img)
            line_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, self.LINE_KERNEL_SIZE)
            dilated = cv2.dilate(thresh, line_kernel, iterations=self.LINE_DILATE_ITERATIONS)

            # Step 3: each external contour of the dilated mask is a candidate line.
            contours, _ = cv2.findContours(dilated, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

            lines = []
            for cnt in contours:
                x, y, w, h = cv2.boundingRect(cnt)
                # Filter out small noise and full page boundaries.
                if w < self.MIN_LINE_WIDTH or h < self.MIN_LINE_HEIGHT:
                    continue
                if w > w_img * self.MAX_PAGE_FRACTION or h > h_img * self.MAX_PAGE_FRACTION:
                    continue
                lines.append({"box": (x, y, w, h), "area": w * h})

            # Top-to-bottom first; column grouping then yields the reading order.
            lines.sort(key=lambda entry: entry["box"][1])
            columns = self._group_lines_into_columns(lines, w_img)

            structured_layout = {
                "width": w_img,
                "height": h_img,
                "column_count": len(columns),
                "columns": columns,
            }
            print(f"[INFO] Layout parsing complete. Detected {len(columns)} text columns.")
            return structured_layout
        except Exception as e:
            print(f"[ERROR] Layout parsing failed: {e}")
            return {"width": 0, "height": 0, "column_count": 1, "columns": []}

    @staticmethod
    def _find_column_ranges(coverage: np.ndarray, min_width: int) -> List[Tuple[int, int]]:
        """Return ``(start, end)`` x-ranges where more than one line overlaps.

        ``end`` is exclusive. Runs narrower than ``min_width`` are dropped.
        """
        ranges = []
        run_start = None
        # The trailing False sentinel closes a run that touches the right page
        # edge, so it passes through the same minimum-width filter as any other.
        for x, occupied in enumerate((coverage > 1).tolist() + [False]):
            if occupied and run_start is None:
                run_start = x
            elif not occupied and run_start is not None:
                if x - run_start >= min_width:
                    ranges.append((run_start, x))
                run_start = None
        return ranges

    @staticmethod
    def _pick_column(center_x: float, ranges: List[Tuple[int, int]]) -> int:
        """Return the index of the column containing ``center_x``, else the nearest one."""
        for idx, (start, end) in enumerate(ranges):
            if start <= center_x <= end:
                return idx
        # No column contains the centre: choose the one with the closest edge
        # (ties go to the leftmost column).
        return min(
            range(len(ranges)),
            key=lambda idx: min(abs(center_x - ranges[idx][0]), abs(center_x - ranges[idx][1])),
        )

    def _group_lines_into_columns(self, lines: List[Dict], page_width: int) -> List[Dict]:
        """Group detected text lines into column blocks based on horizontal overlap.

        Args:
            lines: Dicts whose ``"box"`` entry is an ``(x, y, w, h)`` tuple.
            page_width: Width of the page in pixels.

        Returns:
            One dict per column, left to right, each with ``"x_range"``
            ``(start, end)`` and ``"lines"`` (boxes sorted by ``y``). An empty
            list when ``lines`` is empty. If no run of overlapping lines is
            wide enough, a single column spanning the whole page is used.
        """
        if not lines:
            return []

        # Horizontal projection: how many line boxes cover each x position.
        coverage = np.zeros(page_width, dtype=np.int32)
        for line in lines:
            x, _, w, _ = line["box"]
            coverage[x:x + w] += 1

        min_col_width = int(page_width * self.MIN_COLUMN_FRACTION)
        columns_x = self._find_column_ranges(coverage, min_col_width)
        if not columns_x:
            columns_x = [(0, page_width)]

        cols_data = [{"x_range": x_range, "lines": []} for x_range in columns_x]
        for line in lines:
            x, y, w, h = line["box"]
            target = self._pick_column(x + w / 2, columns_x)
            cols_data[target]["lines"].append((x, y, w, h))

        # Sort lines inside each column by vertical (y) coordinate.
        for col in cols_data:
            col["lines"].sort(key=lambda box: box[1])
        return cols_data

    def crop_lines(self, image_path: str, layout: Dict) -> "List[Image.Image]":
        """Crop and return PIL images of detected text lines in reading order.

        Args:
            image_path: Path of the image to read with ``cv2.imread``.
            layout: A layout dict as returned by ``analyze_layout``; its
                ``"columns"`` entries are iterated in order.

        Returns:
            RGB PIL images, one per non-empty crop. An empty list if the image
            cannot be read or any error occurs.
        """
        try:
            img = cv2.imread(image_path)
            if img is None:
                return []
            h_img, w_img = img.shape[:2]

            crops = []
            for col in layout.get("columns", []):
                for (x, y, w, h) in col["lines"]:
                    # Add small padding for HTR/OCR context, clamped to the image.
                    pad_y = int(h * 0.1) + 2
                    pad_x = int(w * 0.05) + 2
                    y0 = max(0, y - pad_y)
                    y1 = min(h_img, y + h + pad_y)
                    x0 = max(0, x - pad_x)
                    x1 = min(w_img, x + w + pad_x)
                    crop = img[y0:y1, x0:x1]
                    if crop.size > 0:
                        crops.append(Image.fromarray(cv2.cvtColor(crop, cv2.COLOR_BGR2RGB)))
            return crops
        except Exception as e:
            print(f"[ERROR] Failed to crop layout lines: {e}")
            return []

    @staticmethod
    def _classify_style_with_clip(image_path: str, clip_classifier) -> str:
        """Classify the image as "printed" or "cursive" with a CLIP model.

        ``clip_classifier`` must expose ``processor``, ``model`` and ``device``.
        Any exception propagates so the caller can fall back to heuristics.
        """
        import torch  # Imported lazily: only needed when a classifier is supplied.

        # The context manager closes the underlying file once pixels are loaded.
        with Image.open(image_path) as source:
            image = source.convert("RGB")

        styles = ["printed", "cursive"]
        descriptions = [
            "classical printed Latin text or carved Roman stone monumental inscription with clean block capital letters",
            "medieval handwritten Latin manuscript text written in ink on parchment with cursive handwriting",
        ]
        inputs = clip_classifier.processor(
            text=descriptions,
            images=image,
            return_tensors="pt",
            padding=True,
        ).to(clip_classifier.device)

        with torch.no_grad():
            outputs = clip_classifier.model(**inputs)
        probs = outputs.logits_per_image.softmax(dim=1).cpu().numpy()[0]

        best_idx = int(np.argmax(probs))
        style_label = styles[best_idx]
        confidence = float(probs[best_idx])
        print(f"[INFO] CLIP Latin style classification: {style_label} ({confidence:.3f})")
        return style_label

    def _classify_style_with_heuristics(self, image_path: str) -> str:
        """Classify the image from the mean aspect ratio of its connected components.

        Returns "cursive" when the image cannot be read or no usable
        component is found.
        """
        img = cv2.imread(image_path)
        if img is None:
            return "cursive"  # Safe default

        # No dilation here: components should stay at character level.
        thresh = self._binarize(img)
        contours, _ = cv2.findContours(thresh, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
        if not contours:
            return "cursive"

        aspect_ratios = []
        widths = []
        for cnt in contours:
            _, _, w, h = cv2.boundingRect(cnt)
            # Filter noise
            if w < self.MIN_GLYPH_SIZE or h < self.MIN_GLYPH_SIZE:
                continue
            aspect_ratios.append(w / h)
            widths.append(w)
        if not aspect_ratios:
            return "cursive"

        avg_aspect_ratio = np.mean(aspect_ratios)
        median_width = np.median(widths)
        # Heuristic assumption: printed characters are separate, roughly square
        # components, while connected cursive strokes form wider components.
        print(f"[DEBUG] Layout heuristics - connected components: {len(aspect_ratios)}, avg aspect ratio: {avg_aspect_ratio:.3f}, median width: {median_width:.1f}")
        if avg_aspect_ratio < self.PRINTED_MAX_ASPECT_RATIO:
            return "printed"
        return "cursive"

    def detect_writing_style(self, image_path: str, clip_classifier=None) -> str:
        """Detect if document image contains 'printed' capital letters or 'cursive' handwriting.

        Args:
            image_path: Path of the image to classify.
            clip_classifier: Optional object exposing ``model``, ``processor``
                and ``device``. When it is usable, CLIP is tried first.

        Returns:
            ``"printed"`` or ``"cursive"``. If CLIP is unavailable or fails,
            the computer-vision heuristic is used; if that fails too,
            ``"cursive"`` is returned.
        """
        try:
            # getattr: a classifier object lacking these attributes must fall
            # through to the heuristics rather than abort detection entirely.
            clip_ready = (
                clip_classifier
                and getattr(clip_classifier, "model", None)
                and getattr(clip_classifier, "processor", None)
            )
            # 1. Try using CLIP classifier if provided
            if clip_ready:
                try:
                    return self._classify_style_with_clip(image_path, clip_classifier)
                except Exception as e:
                    print(f"[WARN] CLIP Latin style detection failed: {e}. Falling back to heuristics.")

            # 2. Fallback: Computer Vision heuristics
            print("[INFO] Running computer vision heuristics for Latin style detection...")
            return self._classify_style_with_heuristics(image_path)
        except Exception as e:
            print(f"[WARN] Latin style detection failed completely: {e}. Defaulting to cursive.")
            return "cursive"