# Page header (scrape residue): Spaces - Sleeping - File size: 10,159 Bytes - revision 2f4af3f
import cv2
import numpy as np
from PIL import Image
from typing import List, Dict, Tuple
class LayoutParser:
    """Locate text lines in a document image and group them into columns.

    Every public method reads the image from a path with OpenCV and reports
    failures by printing a message and returning a fallback value instead of
    raising.
    """

    def __init__(self):
        """Create a parser; no state is stored on the instance."""

    @staticmethod
    def _binarize(img):
        """Return an inverted Otsu mask of a BGR image (dark pixels become 255)."""
        gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
        # Blur before thresholding so speckle noise does not skew Otsu's split.
        blur = cv2.GaussianBlur(gray, (5, 5), 0)
        _, thresh = cv2.threshold(blur, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)
        return thresh

    @staticmethod
    def _closest_column(columns_x: List[Tuple[int, int]], center_x: float) -> int:
        """Return the index of the column containing ``center_x``, else the nearest one.

        Distance to a column is measured to its closer edge; ties go to the
        left-most (first) column.
        """
        def distance(idx: int) -> float:
            start, end = columns_x[idx]
            if start <= center_x <= end:
                return 0
            return min(abs(center_x - start), abs(center_x - end))

        return min(range(len(columns_x)), key=distance)

    def analyze_layout(self, image_path: str) -> Dict:
        """Analyze document image layout to detect columns and lines of text.

        Args:
            image_path: Path of the image to read with ``cv2.imread``.

        Returns:
            A dict with keys ``"width"``, ``"height"``, ``"column_count"`` and
            ``"columns"`` (the output of ``_group_lines_into_columns``). If
            anything fails, the error is printed and the fallback
            ``{"width": 0, "height": 0, "column_count": 1, "columns": []}``
            is returned.
        """
        try:
            img = cv2.imread(image_path)
            if img is None:
                raise FileNotFoundError(f"Image not found: {image_path}")
            h_img, w_img = img.shape[:2]

            # Step 1: remove noise and binarize.
            thresh = self._binarize(img)

            # Step 2: dilate with a wide, short kernel so the words of one
            # text line merge into a single horizontal blob.
            line_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (25, 3))
            dilated = cv2.dilate(thresh, line_kernel, iterations=2)

            # Step 3: each external contour of the dilated mask is a line candidate.
            contours, _ = cv2.findContours(dilated, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
            lines = []
            for cnt in contours:
                x, y, w, h = cv2.boundingRect(cnt)
                # Drop small specks ...
                if w < 15 or h < 5:
                    continue
                # ... and boxes that span (almost) the whole page.
                if w > w_img * 0.98 or h > h_img * 0.98:
                    continue
                lines.append({"box": (x, y, w, h), "area": w * h})

            # Top-to-bottom order first; column grouping handles left-to-right.
            lines.sort(key=lambda line: line["box"][1])
            columns = self._group_lines_into_columns(lines, w_img)

            structured_layout = {
                "width": w_img,
                "height": h_img,
                "column_count": len(columns),
                "columns": columns,
            }
            print(f"[INFO] Layout parsing complete. Detected {len(columns)} text columns.")
            return structured_layout
        except Exception as e:
            print(f"[ERROR] Layout parsing failed: {e}")
            return {"width": 0, "height": 0, "column_count": 1, "columns": []}

    def _group_lines_into_columns(self, lines: List[Dict], page_width: int) -> List[Dict]:
        """Group detected text lines into column blocks based on horizontal overlap.

        A horizontal coverage histogram counts how many line boxes cover each
        x position. A maximal run of positions covered by at least two lines,
        and at least 10% of ``page_width`` wide, becomes a column. If no run
        qualifies, the whole page is treated as a single column.

        Args:
            lines: Dicts whose ``"box"`` entry is an ``(x, y, w, h)`` tuple.
            page_width: Width of the page in pixels.

        Returns:
            One dict per column, left to right, with ``"x_range"`` as
            ``(start, end)`` (end exclusive) and ``"lines"`` as the assigned
            ``(x, y, w, h)`` boxes sorted by ``y``. Empty list if ``lines``
            is empty.
        """
        if not lines:
            return []

        # Horizontal projection: number of line boxes covering each x.
        hist = np.zeros(page_width, dtype=np.int32)
        for line in lines:
            x, _, w, _ = line["box"]
            hist[x:x + w] += 1

        min_col_width = int(page_width * 0.1)

        # Pad with False on both sides so every run has a rising and a falling
        # edge, including one that reaches the right page border. Edge indices
        # then alternate start, end, start, end, ...
        covered = hist > 1
        padded = np.concatenate(([False], covered, [False]))
        edges = np.flatnonzero(padded[1:] != padded[:-1])
        # The width filter applies to every run, so a sliver of noise touching
        # the right border cannot become a column on its own.
        columns_x = [
            (int(start), int(end))
            for start, end in zip(edges[0::2], edges[1::2])
            if end - start >= min_col_width
        ]
        if not columns_x:
            columns_x = [(0, page_width)]

        # Assign each line to the column containing (or closest to) its center.
        cols_data = [{"x_range": x_range, "lines": []} for x_range in columns_x]
        for line in lines:
            x, y, w, h = line["box"]
            idx = self._closest_column(columns_x, x + w / 2)
            cols_data[idx]["lines"].append((x, y, w, h))

        # Reading order inside a column is top to bottom.
        for col in cols_data:
            col["lines"].sort(key=lambda box: box[1])
        return cols_data

    def crop_lines(self, image_path: str, layout: Dict) -> "List[Image.Image]":
        """Crop and return PIL images of detected text lines in reading order.

        Args:
            image_path: Path of the image to read with ``cv2.imread``.
            layout: Dict whose ``"columns"`` entry is a list of column dicts,
                each with a ``"lines"`` list of ``(x, y, w, h)`` boxes.

        Returns:
            RGB PIL images, column by column in the order given by ``layout``.
            An empty list if the image cannot be read or any error occurs
            (the error is printed).
        """
        try:
            img = cv2.imread(image_path)
            if img is None:
                return []
            crops = []
            h_img, w_img = img.shape[:2]
            for col in layout.get("columns", []):
                for (x, y, w, h) in col["lines"]:
                    # Small padding gives HTR/OCR a little context around the line.
                    pad_y = int(h * 0.1) + 2
                    pad_x = int(w * 0.05) + 2
                    # Clamp the padded box to the image bounds.
                    y0 = max(0, y - pad_y)
                    y1 = min(h_img, y + h + pad_y)
                    x0 = max(0, x - pad_x)
                    x1 = min(w_img, x + w + pad_x)
                    crop = img[y0:y1, x0:x1]
                    if crop.size > 0:
                        crops.append(Image.fromarray(cv2.cvtColor(crop, cv2.COLOR_BGR2RGB)))
            return crops
        except Exception as e:
            print(f"[ERROR] Failed to crop layout lines: {e}")
            return []

    def _classify_style_with_clip(self, image_path: str, clip_classifier) -> str:
        """Return ``"printed"`` or ``"cursive"`` from an image/text similarity model.

        ``clip_classifier`` must expose ``processor``, ``model`` and ``device``
        attributes. Any exception propagates to the caller.
        """
        styles = ["printed", "cursive"]
        descriptions = [
            "classical printed Latin text or carved Roman stone monumental inscription with clean block capital letters",
            "medieval handwritten Latin manuscript text written in ink on parchment with cursive handwriting"
        ]
        # Close the file handle deterministically; convert() returns a new,
        # fully loaded image that stays valid after the source is closed.
        with Image.open(image_path) as source:
            image = source.convert("RGB")
        inputs = clip_classifier.processor(
            text=descriptions,
            images=image,
            return_tensors="pt",
            padding=True
        ).to(clip_classifier.device)
        # Imported here so torch is only needed when a classifier is supplied.
        import torch
        with torch.no_grad():
            outputs = clip_classifier.model(**inputs)
        probs = outputs.logits_per_image.softmax(dim=1).cpu().numpy()[0]
        best_idx = int(np.argmax(probs))
        style_label = styles[best_idx]
        confidence = float(probs[best_idx])
        print(f"[INFO] CLIP Latin style classification: {style_label} ({confidence:.3f})")
        return style_label

    def detect_writing_style(self, image_path: str, clip_classifier=None) -> str:
        """Detect whether a document image is 'printed' or 'cursive'.

        Args:
            image_path: Path of the image to classify.
            clip_classifier: Optional object with truthy ``model`` and
                ``processor`` attributes (and a ``device``). When given, it is
                tried first; if it raises, a warning is printed and the
                heuristic below is used instead.

        Returns:
            ``"printed"`` or ``"cursive"``. The heuristic binarizes the image,
            takes the bounding boxes of external contours at least 5x5 pixels
            in size, and answers ``"printed"`` when their mean width/height
            ratio is below 1.3. ``"cursive"`` is the default whenever the
            image cannot be read, no usable component exists, or an error
            occurs.
        """
        try:
            # 1. Try the CLIP classifier if one was provided.
            if clip_classifier and clip_classifier.model and clip_classifier.processor:
                try:
                    return self._classify_style_with_clip(image_path, clip_classifier)
                except Exception as e:
                    print(f"[WARN] CLIP Latin style detection failed: {e}. Falling back to heuristics.")

            # 2. Fallback: computer vision heuristics.
            print("[INFO] Running computer vision heuristics for Latin style detection...")
            img = cv2.imread(image_path)
            if img is None:
                return "cursive"  # Safe default

            thresh = self._binarize(img)

            # No dilation here: components should stay at character level.
            contours, _ = cv2.findContours(thresh, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
            if not contours:
                return "cursive"

            aspect_ratios = []
            widths = []
            for cnt in contours:
                _, _, w, h = cv2.boundingRect(cnt)
                # Filter noise
                if w < 5 or h < 5:
                    continue
                aspect_ratios.append(w / h)
                widths.append(w)

            if not aspect_ratios:
                return "cursive"

            avg_aspect_ratio = np.mean(aspect_ratios)
            median_width = np.median(widths)

            # Printed characters are typically separate, tall/square shapes
            # (aspect ratio roughly 0.7 - 1.2); connected cursive letters form
            # wider horizontal segments (aspect ratio > 1.5).
            print(f"[DEBUG] Layout heuristics - connected components: {len(aspect_ratios)}, avg aspect ratio: {avg_aspect_ratio:.3f}, median width: {median_width:.1f}")

            if avg_aspect_ratio < 1.3:
                return "printed"
            return "cursive"
        except Exception as e:
            print(f"[WARN] Latin style detection failed completely: {e}. Defaulting to cursive.")
            return "cursive"
|