Spaces:

Akshay30
/

decipherai-api

Sleeping

App Files Files Community

decipherai-api / services / layout_parser.py

Akshay30

Initial DecipherAI backend deployment

2f4af3f 4 days ago

raw

history blame contribute delete

10.2 kB

	import cv2
	import numpy as np
	from PIL import Image
	from typing import List, Dict, Tuple

	class LayoutParser:
	    """Detect text lines and columns in a document image and guess its script style.

	    The class keeps no instance state; every method works from the image path
	    (and, for ``crop_lines``, a layout dict produced by ``analyze_layout``).
	    All public methods are best-effort: they log with ``print`` and return a
	    fallback value instead of raising.
	    """

	    def __init__(self):
	        pass

	    def analyze_layout(self, image_path: str) -> Dict:
	        """Analyze document image layout to detect columns and lines of text.

	        Args:
	            image_path: Path of the image file, read with ``cv2.imread``.

	        Returns:
	            A dict with the keys ``"width"`` and ``"height"`` (image size in
	            pixels), ``"column_count"`` and ``"columns"``. Each column is a
	            dict with ``"x_range"`` (``(start_x, end_x)``) and ``"lines"``, a
	            list of ``(x, y, w, h)`` boxes sorted top to bottom.

	            On any failure (including an unreadable image) the error is
	            printed and the fallback
	            ``{"width": 0, "height": 0, "column_count": 1, "columns": []}``
	            is returned. Note that the fallback reports ``column_count`` 1
	            although ``columns`` is empty.
	        """
	        try:
	            img = cv2.imread(image_path)
	            if img is None:
	                # Raised only to reach the shared fallback in the except clause below.
	                raise FileNotFoundError(f"Image not found: {image_path}")

	            h_img, w_img, _ = img.shape
	            gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

	            # Step 1: denoise with a Gaussian blur, then binarize with Otsu's
	            # threshold (inverted so that ink becomes the foreground).
	            blur = cv2.GaussianBlur(gray, (5, 5), 0)
	            _, thresh = cv2.threshold(blur, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)

	            # Step 2: dilate with a wide, flat kernel so the words of one text
	            # line merge into a single horizontal blob.
	            line_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (25, 3))
	            dilated = cv2.dilate(thresh, line_kernel, iterations=2)

	            # Step 3: every external contour of the dilated image is a line candidate.
	            contours, _ = cv2.findContours(dilated, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

	            lines = []
	            for cnt in contours:
	                x, y, w, h = cv2.boundingRect(cnt)

	                # Drop small specks of noise.
	                if w < 15 or h < 5:
	                    continue
	                # Drop boxes that span (almost) the whole page, e.g. a page border.
	                if w > w_img * 0.98 or h > h_img * 0.98:
	                    continue

	                lines.append({"box": (x, y, w, h), "area": w * h})

	            # Order candidates by their top coordinate before grouping them
	            # into columns (the sort is stable, as in the original ordering).
	            lines.sort(key=lambda entry: entry["box"][1])

	            columns = self._group_lines_into_columns(lines, w_img)

	            structured_layout = {
	                "width": w_img,
	                "height": h_img,
	                "column_count": len(columns),
	                "columns": columns,
	            }

	            print(f"[INFO] Layout parsing complete. Detected {len(columns)} text columns.")
	            return structured_layout

	        except Exception as e:
	            print(f"[ERROR] Layout parsing failed: {e}")
	            return {"width": 0, "height": 0, "column_count": 1, "columns": []}

	    def _group_lines_into_columns(self, lines: List[Dict], page_width: int) -> List[Dict]:
	        """Group detected text lines into columns based on horizontal overlap.

	        A histogram over the page width counts how many line boxes cover each
	        x position. Runs of positions covered by at least two lines, and at
	        least 10% of the page wide, become columns. If no run qualifies, the
	        whole page ``(0, page_width)`` is used as a single column.

	        Args:
	            lines: Dicts with a ``"box"`` entry holding ``(x, y, w, h)``.
	            page_width: Width of the page in pixels.

	        Returns:
	            One dict per column with ``"x_range"`` (``(start_x, end_x)``) and
	            ``"lines"`` (``(x, y, w, h)`` tuples sorted by ``y``). An empty
	            list when ``lines`` is empty.
	        """
	        if not lines:
	            return []

	        # Horizontal projection: number of line boxes covering each x position.
	        hist = np.zeros(page_width, dtype=np.int32)
	        for line in lines:
	            x, _, w, _ = line["box"]
	            hist[x:x + w] += 1

	        # Scan the histogram for runs where more than one line overlaps.
	        min_col_width = int(page_width * 0.1)
	        columns_x = []
	        in_col = False
	        start_x = 0

	        for x, val in enumerate(hist):
	            if val > 1 and not in_col:
	                in_col = True
	                start_x = x
	            elif val <= 1 and in_col:
	                in_col = False
	                if (x - start_x) >= min_col_width:
	                    columns_x.append((start_x, x))

	        # A run still open at the right page edge is closed at page_width and
	        # must pass the same minimum-width test as every other run; otherwise
	        # a thin sliver touching the edge would become a column of its own.
	        if in_col and (page_width - start_x) >= min_col_width:
	            columns_x.append((start_x, page_width))

	        if not columns_x:
	            columns_x = [(0, page_width)]

	        cols_data = [{"x_range": rx, "lines": []} for rx in columns_x]

	        for line in lines:
	            x, y, w, h = line["box"]
	            line_center_x = x + w / 2

	            # Prefer the first column containing the line's centre; otherwise
	            # fall back to the column whose nearest edge is closest.
	            best_idx = 0
	            min_dist = page_width
	            for idx, col in enumerate(cols_data):
	                cx_start, cx_end = col["x_range"]
	                if cx_start <= line_center_x <= cx_end:
	                    best_idx = idx
	                    break
	                dist = min(abs(line_center_x - cx_start), abs(line_center_x - cx_end))
	                if dist < min_dist:
	                    min_dist = dist
	                    best_idx = idx

	            cols_data[best_idx]["lines"].append((x, y, w, h))

	        # Reading order inside a column is top to bottom.
	        for col in cols_data:
	            col["lines"].sort(key=lambda box: box[1])

	        return cols_data

	    def crop_lines(self, image_path: str, layout: Dict) -> "List[Image.Image]":
	        """Crop and return PIL images of the text lines listed in ``layout``.

	        Crops are produced column by column in the order the columns appear in
	        ``layout["columns"]``, and within a column in the order of its
	        ``"lines"`` list.

	        Args:
	            image_path: Path of the image file, read with ``cv2.imread``.
	            layout: Dict with a ``"columns"`` list as returned by
	                ``analyze_layout``; a missing key is treated as no columns.

	        Returns:
	            RGB ``PIL.Image`` crops. An empty list if the image cannot be read
	            or any error occurs (the error is printed).
	        """
	        try:
	            img = cv2.imread(image_path)
	            if img is None:
	                return []

	            crops = []
	            h_img, w_img, _ = img.shape

	            for col in layout.get("columns", []):
	                for (x, y, w, h) in col["lines"]:
	                    # Small margin around the line for HTR/OCR context:
	                    # 10% of the height / 5% of the width, plus 2 px.
	                    pad_y = int(h * 0.1) + 2
	                    pad_x = int(w * 0.05) + 2

	                    # Clamp the padded box to the image bounds.
	                    y0 = max(0, y - pad_y)
	                    y1 = min(h_img, y + h + pad_y)
	                    x0 = max(0, x - pad_x)
	                    x1 = min(w_img, x + w + pad_x)

	                    crop = img[y0:y1, x0:x1]
	                    if crop.size > 0:
	                        # OpenCV arrays are BGR; PIL expects RGB.
	                        crops.append(Image.fromarray(cv2.cvtColor(crop, cv2.COLOR_BGR2RGB)))

	            return crops
	        except Exception as e:
	            print(f"[ERROR] Failed to crop layout lines: {e}")
	            return []

	    def detect_writing_style(self, image_path: str, clip_classifier=None) -> str:
	        """Classify the document image as ``"printed"`` or ``"cursive"``.

	        Args:
	            image_path: Path of the image file.
	            clip_classifier: Optional object exposing ``model``, ``processor``
	                and ``device`` attributes (presumably a CLIP model/processor
	                pair; confirm against the caller). When it is provided and
	                both ``model`` and ``processor`` are truthy, the image is
	                scored against two text descriptions and the better match
	                wins.

	        Returns:
	            ``"printed"`` or ``"cursive"``. If the CLIP path is unavailable or
	            fails, a connected-component heuristic is used instead:
	            ``"printed"`` when the mean width/height ratio of the components
	            is below 1.3. ``"cursive"`` is the default whenever the image
	            cannot be read, nothing is found, or an error occurs.
	        """
	        try:
	            # 1. Try using CLIP classifier if provided
	            if clip_classifier and clip_classifier.model and clip_classifier.processor:
	                try:
	                    # Close the file handle deterministically; convert() returns
	                    # an independent in-memory copy.
	                    with Image.open(image_path) as source:
	                        image = source.convert("RGB")

	                    styles = ["printed", "cursive"]
	                    descriptions = [
	                        "classical printed Latin text or carved Roman stone monumental inscription with clean block capital letters",
	                        "medieval handwritten Latin manuscript text written in ink on parchment with cursive handwriting"
	                    ]

	                    inputs = clip_classifier.processor(
	                        text=descriptions,
	                        images=image,
	                        return_tensors="pt",
	                        padding=True
	                    ).to(clip_classifier.device)

	                    # Imported lazily so torch is only needed on the CLIP path.
	                    import torch
	                    with torch.no_grad():
	                        outputs = clip_classifier.model(**inputs)
	                        logits_per_image = outputs.logits_per_image
	                        probs = logits_per_image.softmax(dim=1).cpu().numpy()[0]

	                    best_idx = int(np.argmax(probs))
	                    style_label = styles[best_idx]
	                    confidence = float(probs[best_idx])
	                    print(f"[INFO] CLIP Latin style classification: {style_label} ({confidence:.3f})")
	                    return style_label
	                except Exception as e:
	                    print(f"[WARN] CLIP Latin style detection failed: {e}. Falling back to heuristics.")

	            # 2. Fallback: Computer Vision heuristics
	            print("[INFO] Running computer vision heuristics for Latin style detection...")
	            img = cv2.imread(image_path)
	            if img is None:
	                return "cursive"  # Safe default

	            gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
	            blur = cv2.GaussianBlur(gray, (5, 5), 0)
	            _, thresh = cv2.threshold(blur, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)

	            # No dilation here: contours should stay at character/stroke level.
	            contours, _ = cv2.findContours(thresh, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

	            if not contours:
	                return "cursive"

	            aspect_ratios = []
	            widths = []

	            for cnt in contours:
	                _, _, w, h = cv2.boundingRect(cnt)
	                # Filter noise
	                if w < 5 or h < 5:
	                    continue
	                aspect_ratios.append(w / h)
	                widths.append(w)

	            if not aspect_ratios:
	                return "cursive"

	            avg_aspect_ratio = np.mean(aspect_ratios)
	            median_width = np.median(widths)  # reported in the debug line only

	            # Heuristic: printed characters are separate, tall/square shapes
	            # (aspect ratio around 0.7 - 1.2), while cursive writing joins
	            # letters into wider horizontal components (aspect ratio > 1.5).
	            print(f"[DEBUG] Layout heuristics - connected components: {len(aspect_ratios)}, avg aspect ratio: {avg_aspect_ratio:.3f}, median width: {median_width:.1f}")

	            return "printed" if avg_aspect_ratio < 1.3 else "cursive"

	        except Exception as e:
	            print(f"[WARN] Latin style detection failed completely: {e}. Defaulting to cursive.")
	            return "cursive"