stella-score-reader / score_preprocess.py
CAY96
์ „์ฒ˜๋ฆฌยท์กฐํ‘œ ํŒŒ์‹ฑยทmeta/debugยท๋ฌธ์„œ
e79fe0c
"""
Per-page sheet-music preprocessing: resolution cap, illumination normalization, LAB-CLAHE,
optional perspective correction, light edge-preserving smoothing and unsharp masking,
small-angle deskew.
Detection and recognition run on the preprocessed image; response bboxes are mapped back
to the pixel coordinates of the uploaded original.
"""
from __future__ import annotations
import os
from dataclasses import dataclass, field
from typing import Any, Dict, List, Tuple
import cv2
import numpy as np
# Requested upper bound (pixels) for the long side of the work image, read once at import
# time from STELLA_PREPROCESS_MAX_SIDE. preprocess_page_bgr raises it to at least 512.
_MAX_SIDE_DEFAULT = int(os.environ.get("STELLA_PREPROCESS_MAX_SIDE", "4096"))
# Boolean environment switches: "1", "true" or "yes" (case-insensitive) turn a flag on.
_SKIP_DESKEW = os.environ.get("STELLA_SKIP_DESKEW", "").lower() in ("1", "true", "yes")
_SKIP_PERSPECTIVE = os.environ.get("STELLA_SKIP_PERSPECTIVE", "").lower() in ("1", "true", "yes")
# Perspective correction can ruin image quality on a false detection when the input is a
# score-only crop, so it is off by default. Enable it only for document-style photos.
_ENABLE_PERSPECTIVE = os.environ.get("STELLA_ENABLE_PERSPECTIVE", "").lower() in ("1", "true", "yes")
_SKIP_ILLUMINATION = os.environ.get("STELLA_SKIP_ILLUMINATION", "").lower() in ("1", "true", "yes")
_SKIP_ENHANCE = os.environ.get("STELLA_SKIP_ENHANCE", "").lower() in ("1", "true", "yes")
# Deskew is applied only when |estimated angle| lies within [min, max] degrees.
_DESKEW_MIN_DEG = 0.35
_DESKEW_MAX_DEG = 6.0
# Perspective: too weak has no effect, too aggressive distorts the score.
_PERSP_MIN_AREA_RATIO = 0.22  # a candidate contour must cover at least this share of the frame
_PERSP_FULL_FRAME_RATIO = 0.92  # a quad larger than this share of the frame is treated as "already full frame"
# NOTE: despite the name, this is used as a lower bound: the shortest quad side must be at
# least this fraction of min(h, w) for the quad to be accepted.
_PERSP_MAX_EDGE_RATIO = 0.035
@dataclass
class PageGeometry:
    """Map pixels of the final work image back to the uploaded (decoded) original.

    The mapping is stored as a 3x3 homogeneous matrix in ``_work_to_orig``;
    the remaining fields describe the image sizes and the steps that were applied.
    """

    orig_w: int
    orig_h: int
    work_w: int
    work_h: int
    scale_x: float  # horizontal resize factor; only read in the degenerate fallback below
    scale_y: float  # vertical resize factor; only read in the degenerate fallback below
    deskew_deg_applied: float
    _work_to_orig: np.ndarray = field(repr=False)

    def work_point_to_original(self, x: float, y: float) -> Tuple[int, int]:
        """Project one work-image point to original-image pixel coordinates.

        Args:
            x: Horizontal coordinate in the work image.
            y: Vertical coordinate in the work image.

        Returns:
            ``(x, y)`` in the original image, rounded to integers.
        """
        homogeneous = np.array([float(x), float(y), 1.0], dtype=np.float64)
        projected = self._work_to_orig @ homogeneous
        weight = float(projected[2])
        # A (near-)zero homogeneous weight cannot be divided out; in that case the
        # first two components are divided by the resize factors instead.
        div_x, div_y = (self.scale_x, self.scale_y) if abs(weight) < 1e-12 else (weight, weight)
        x_orig = float(projected[0]) / div_x
        y_orig = float(projected[1]) / div_y
        return int(round(x_orig)), int(round(y_orig))

    def work_rect_to_original_aabb(self, x0: int, y0: int, w: int, h: int) -> List[int]:
        """Map a work-image rectangle to its axis-aligned bounding box in the original.

        Args:
            x0: Left edge of the rectangle in the work image.
            y0: Top edge of the rectangle in the work image.
            w: Rectangle width.
            h: Rectangle height.

        Returns:
            ``[x, y, width, height]`` in original pixels; width and height are at least 1.
        """
        x1, y1 = x0 + w, y0 + h
        # All four corners are projected because the mapping may rotate or warp the box.
        xs, ys = zip(
            *(
                self.work_point_to_original(cx, cy)
                for cx, cy in ((x0, y0), (x1, y0), (x0, y1), (x1, y1))
            )
        )
        left, top = min(xs), min(ys)
        return [left, top, max(1, max(xs) - left), max(1, max(ys) - top)]
def _clahe_lab_bgr(image_bgr: np.ndarray, clip_limit: float = 2.0, grid: int = 8) -> np.ndarray:
    """Apply CLAHE to the L channel of a BGR image, leaving the a/b channels untouched.

    Args:
        image_bgr: Input image in BGR channel order.
        clip_limit: CLAHE clip limit.
        grid: Number of tiles per side (the tile grid is ``grid x grid``).

    Returns:
        The contrast-equalized image, converted back to BGR.
    """
    lightness, chan_a, chan_b = cv2.split(cv2.cvtColor(image_bgr, cv2.COLOR_BGR2LAB))
    equalizer = cv2.createCLAHE(clipLimit=clip_limit, tileGridSize=(grid, grid))
    lab_equalized = cv2.merge((equalizer.apply(lightness), chan_a, chan_b))
    return cv2.cvtColor(lab_equalized, cv2.COLOR_LAB2BGR)
def _normalize_illumination_lab(image_bgr: np.ndarray) -> np.ndarray:
    """Divide the L channel by its low-frequency illumination to soften shadows and vignetting.

    Args:
        image_bgr: Input image in BGR channel order.

    Returns:
        The image with a flattened L channel, converted back to BGR.
    """
    lightness, chan_a, chan_b = cv2.split(cv2.cvtColor(image_bgr, cv2.COLOR_BGR2LAB))
    rows, cols = lightness.shape[:2]
    # Kernel is ~6% of the short side, forced odd (``| 1``) and never below 31.
    kernel = max(31, int(round(min(rows, cols) * 0.06)) | 1)
    background = cv2.GaussianBlur(lightness, (kernel, kernel), 0)
    # Floor the background at 8 so very dark regions do not blow up the ratio.
    background_f = np.maximum(background.astype(np.float32), 8.0)
    ratio = lightness.astype(np.float32) / background_f
    flattened = np.clip(ratio * 96.0 + 48.0, 0, 255).astype(np.uint8)
    return cv2.cvtColor(cv2.merge((flattened, chan_a, chan_b)), cv2.COLOR_LAB2BGR)
def _mild_unsharp_lab_bgr(image_bgr: np.ndarray, sigma: float = 1.0, amount: float = 0.35) -> np.ndarray:
    """Sharpen the L channel with an unsharp mask; the a/b channels are left as they are.

    Args:
        image_bgr: Input image in BGR channel order.
        sigma: Gaussian sigma used for the blurred copy.
        amount: Sharpening strength; the result is ``(1 + amount) * L - amount * blur``.

    Returns:
        The sharpened image, converted back to BGR.
    """
    lightness, chan_a, chan_b = cv2.split(cv2.cvtColor(image_bgr, cv2.COLOR_BGR2LAB))
    # Kernel size (0, 0) lets OpenCV derive the kernel from sigma.
    softened = cv2.GaussianBlur(lightness, (0, 0), sigma)
    sharpened = cv2.addWeighted(lightness, 1.0 + amount, softened, -amount, 0)
    return cv2.cvtColor(cv2.merge((sharpened, chan_a, chan_b)), cv2.COLOR_LAB2BGR)
def _bilateral_or_light_blur_bgr(image_bgr: np.ndarray) -> Tuple[np.ndarray, str]:
    """Smooth the image with a filter chosen by its longest side.

    Up to 2000 px a d=5 bilateral filter is used, up to 3200 px a d=3 bilateral
    filter, and above that only a 3x3 Gaussian blur on the L channel.

    Args:
        image_bgr: Input image in BGR channel order.

    Returns:
        ``(smoothed_bgr, label)`` where ``label`` names the filter that was applied.
    """
    longest_side = max(image_bgr.shape[:2])
    if longest_side <= 2000:
        return cv2.bilateralFilter(image_bgr, d=5, sigmaColor=42, sigmaSpace=42), "bilateral_d5"
    if longest_side <= 3200:
        return cv2.bilateralFilter(image_bgr, d=3, sigmaColor=38, sigmaSpace=38), "bilateral_d3"
    # Large images: bilateral filtering is skipped in favour of a light blur on L only.
    lightness, chan_a, chan_b = cv2.split(cv2.cvtColor(image_bgr, cv2.COLOR_BGR2LAB))
    lab_blurred = cv2.merge((cv2.GaussianBlur(lightness, (3, 3), 0), chan_a, chan_b))
    return cv2.cvtColor(lab_blurred, cv2.COLOR_LAB2BGR), "gaussian_l3_skip_heavy_bilateral"
def _order_quad_points(pts: np.ndarray) -> np.ndarray:
    """Order four 2-D points as [top-left, top-right, bottom-right, bottom-left].

    Args:
        pts: Array of shape (4, 2) holding (x, y) points in any order.

    Returns:
        A float32 array of shape (4, 2) in tl, tr, br, bl order.
    """
    coord_sum = pts.sum(axis=1)  # x + y: smallest at top-left, largest at bottom-right
    coord_diff = np.diff(pts, axis=1).flatten()  # y - x: smallest at top-right, largest at bottom-left
    pick = [
        np.argmin(coord_sum),
        np.argmin(coord_diff),
        np.argmax(coord_sum),
        np.argmax(coord_diff),
    ]
    ordered = np.zeros((4, 2), dtype=np.float32)
    ordered[:] = pts[pick]
    return ordered
def _quad_corners_near_frame(ordered: np.ndarray, w: int, h: int, frac: float = 0.16) -> bool:
    """Check that every quad corner lies close to the image border.

    Real document/page corners usually sit near the frame; this filters out
    quadrilaterals formed by noise in the interior of the image.

    Args:
        ordered: Iterable of four (x, y) corner points.
        w: Image width in pixels.
        h: Image height in pixels.
        frac: Allowed distance to the nearest border, as a fraction of ``min(w, h)``.

    Returns:
        True when no corner is farther than the allowed distance from its nearest border.
    """
    max_dist = frac * float(min(w, h))
    right_edge = float(w - 1)
    bottom_edge = float(h - 1)
    return not any(
        min(float(x), float(y), right_edge - float(x), bottom_edge - float(y)) > max_dist
        for x, y in ordered
    )
def _try_perspective_rectify(image_bgr: np.ndarray) -> Tuple[np.ndarray, np.ndarray, str]:
    """Rectify the perspective when a document-like quadrilateral is found.

    The largest external edge contours (at most eight) are examined; the first
    convex four-point polygon that passes the size and border checks is warped
    to an upright rectangle.

    Args:
        image_bgr: Input image in BGR channel order.

    Returns:
        ``(warped_bgr, M_src_to_dst, reason)``. ``M_src_to_dst`` follows the
        ``getPerspectiveTransform`` convention (``dst_hom ∝ M @ src_hom``) and is
        the identity whenever nothing was applied. ``reason`` is one of
        ``"applied"``, ``"skipped_near_full_frame"`` or ``"no_quadrilateral_found"``.
    """
    rows, cols = image_bgr.shape[:2]
    grayscale = cv2.cvtColor(image_bgr, cv2.COLOR_BGR2GRAY)
    edge_map = cv2.Canny(cv2.GaussianBlur(grayscale, (5, 5), 0), 35, 110)
    # Close small gaps so the page outline forms one connected contour.
    edge_map = cv2.morphologyEx(edge_map, cv2.MORPH_CLOSE, np.ones((5, 5), np.uint8), iterations=1)
    found, _ = cv2.findContours(edge_map, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

    frame_area = float(rows * cols)
    area_floor = _PERSP_MIN_AREA_RATIO * frame_area
    largest_first = sorted(found, key=cv2.contourArea, reverse=True)

    for candidate in largest_first[:8]:
        cand_area = cv2.contourArea(candidate)
        if cand_area < area_floor:
            # Contours are sorted by area, so every remaining one is too small as well.
            break
        perimeter = cv2.arcLength(candidate, True)
        if perimeter < 1e-6:
            continue
        poly = cv2.approxPolyDP(candidate, 0.02 * perimeter, True)
        if len(poly) != 4 or not cv2.isContourConvex(poly):
            continue
        if cand_area > _PERSP_FULL_FRAME_RATIO * frame_area:
            # The quad already fills the frame; warping would gain nothing.
            return image_bgr, np.eye(3, dtype=np.float64), "skipped_near_full_frame"

        quad = _order_quad_points(poly.reshape(4, 2).astype(np.float32))
        top = np.linalg.norm(quad[0] - quad[1])
        right = np.linalg.norm(quad[1] - quad[2])
        bottom = np.linalg.norm(quad[2] - quad[3])
        left = np.linalg.norm(quad[3] - quad[0])
        # Reject quads whose shortest side is tiny relative to the image.
        if min(top, right, bottom, left) < _PERSP_MAX_EDGE_RATIO * float(min(rows, cols)):
            continue
        if not _quad_corners_near_frame(quad, cols, rows):
            continue

        # Output size: the longer of each pair of opposite sides, at least 32 px.
        out_w = max(int(max(top, bottom)), 32)
        out_h = max(int(max(right, left)), 32)
        target = np.array(
            [[0, 0], [out_w - 1, 0], [out_w - 1, out_h - 1], [0, out_h - 1]],
            dtype=np.float32,
        )
        m_src_to_dst = cv2.getPerspectiveTransform(quad, target)
        rectified = cv2.warpPerspective(
            image_bgr,
            m_src_to_dst,
            (out_w, out_h),
            flags=cv2.INTER_LINEAR,
            borderMode=cv2.BORDER_REPLICATE,
        )
        return rectified, m_src_to_dst.astype(np.float64), "applied"

    return image_bgr, np.eye(3, dtype=np.float64), "no_quadrilateral_found"
def estimate_deskew_angle_deg(gray: np.ndarray) -> float:
    """Estimate the page skew from near-horizontal line segments (staff lines).

    The image is downscaled so its long side is at most 800 px, binarized with
    Otsu, opened with a wide horizontal kernel to keep only long horizontal
    strokes, and fed to a probabilistic Hough transform. The median angle of
    the segments within +/-14 degrees of horizontal is returned.

    Args:
        gray: Single-channel image.

    Returns:
        Skew angle in degrees; positive means the right-hand side of the lines
        is lower than the left (image y axis pointing down). 0.0 when no usable
        segment is found.
    """
    rows, cols = gray.shape[:2]
    longest = max(rows, cols)
    if longest > 800:
        factor = 800.0 / longest
        # Clamp to 1 px: truncation would otherwise give a zero-sized target for
        # extremely narrow or flat images, which cv2.resize rejects.
        target = (max(1, int(cols * factor)), max(1, int(rows * factor)))
        small = cv2.resize(gray, target, interpolation=cv2.INTER_AREA)
    else:
        small = gray

    blur = cv2.GaussianBlur(small, (3, 3), 0)
    _, bw = cv2.threshold(blur, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)
    # A wide, 1-px-tall opening keeps long horizontal strokes and drops everything else.
    kernel_w = max(15, small.shape[1] // 20)
    h_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (kernel_w, 1))
    horiz = cv2.morphologyEx(bw, cv2.MORPH_OPEN, h_kernel, iterations=1)
    min_len = max(30, int(small.shape[1] * 0.18))
    lines = cv2.HoughLinesP(
        horiz,
        1,
        np.pi / 180,
        threshold=40,
        minLineLength=min_len,
        maxLineGap=25,
    )
    if lines is None:
        return 0.0

    angles: List[float] = []
    for x1, y1, x2, y2 in lines.reshape(-1, 4):
        dx, dy = float(x2 - x1), float(y2 - y1)
        if abs(dx) < 2.0:
            continue  # practically vertical: not a staff-line candidate
        ang = float(np.degrees(np.arctan2(dy, dx)))
        # Fold the direction so that segments drawn right-to-left give the same angle.
        if ang > 45.0:
            ang -= 180.0
        if ang < -45.0:
            ang += 180.0
        if abs(ang) < 14.0:
            angles.append(ang)
    if not angles:
        return 0.0
    return float(np.median(angles))
def preprocess_page_bgr(image_bgr: np.ndarray) -> Tuple[np.ndarray, PageGeometry, Dict[str, Any]]:
    """Run the page preprocessing pipeline and record how to map results back.

    Steps:
        1) Downscale so the long side does not exceed the configured cap.
        2) LAB-based illumination normalization (optional).
        3) LAB CLAHE.
        4) Perspective correction (optional, when a document-like quad is detected).
        5) Bilateral filter or a light L-channel blur (optional).
        6) LAB unsharp mask (optional, together with step 5).
        7) Deskew when |angle| lies within [_DESKEW_MIN_DEG, _DESKEW_MAX_DEG].

    Args:
        image_bgr: Decoded upload in BGR channel order.

    Returns:
        ``(work, geom, meta)``: the preprocessed image, a ``PageGeometry`` that maps
        work pixels back to the original image, and a dict describing which steps
        ran (keys ``illumination_normalized``, ``perspective``, ``perspective_detail``,
        ``enhance``, ``unsharp``).
    """
    meta: Dict[str, Any] = {
        "illumination_normalized": False,
        "perspective": "off",
        "perspective_detail": None,
        "enhance": None,
        "unsharp": False,
    }

    # 1) Resolution cap.
    orig_h, orig_w = image_bgr.shape[:2]
    max_side = max(orig_h, orig_w)
    cap = max(512, _MAX_SIDE_DEFAULT)
    scale_f = min(1.0, float(cap) / float(max_side))
    if scale_f < 1.0:
        # Clamp to 1 px so extreme aspect ratios cannot round a dimension down to 0,
        # which cv2.resize rejects.
        target_w = max(1, int(round(orig_w * scale_f)))
        target_h = max(1, int(round(orig_h * scale_f)))
        work = cv2.resize(image_bgr, (target_w, target_h), interpolation=cv2.INTER_AREA)
    else:
        work = image_bgr.copy()
    wh, ww = work.shape[:2]
    # Actual per-axis factors after integer rounding of the target size.
    sx = ww / float(orig_w)
    sy = wh / float(orig_h)
    t_scale = np.diag([sx, sy, 1.0]).astype(np.float64)

    # 2) + 3) Photometric normalization (does not change the geometry).
    if not _SKIP_ILLUMINATION:
        work = _normalize_illumination_lab(work)
        meta["illumination_normalized"] = True
    work = _clahe_lab_bgr(work)

    # 4) Perspective correction.
    m_src_to_dst = np.eye(3, dtype=np.float64)
    if _SKIP_PERSPECTIVE:
        meta["perspective"] = "skipped"
        meta["perspective_detail"] = "disabled_by_env"
    elif not _ENABLE_PERSPECTIVE:
        meta["perspective"] = "skipped"
        meta["perspective_detail"] = "disabled_by_default_set_STELLA_ENABLE_PERSPECTIVE"
    else:
        work, m_src_to_dst, persp_reason = _try_perspective_rectify(work)
        meta["perspective"] = "applied" if persp_reason == "applied" else "skipped"
        meta["perspective_detail"] = persp_reason
    # The perspective warp may have changed the image size.
    wh, ww = work.shape[:2]

    # 5) + 6) Smoothing and sharpening.
    if not _SKIP_ENHANCE:
        work, enh_label = _bilateral_or_light_blur_bgr(work)
        meta["enhance"] = enh_label
        work = _mild_unsharp_lab_bgr(work, sigma=1.0, amount=0.32)
        meta["unsharp"] = True
    else:
        meta["enhance"] = "disabled_by_env"
        meta["unsharp"] = False

    # 7) Deskew.
    deskew_applied = 0.0
    a_desk = np.eye(3, dtype=np.float64)
    if not _SKIP_DESKEW:
        gray = cv2.cvtColor(work, cv2.COLOR_BGR2GRAY)
        skew = estimate_deskew_angle_deg(gray)
        if _DESKEW_MIN_DEG <= abs(skew) <= _DESKEW_MAX_DEG:
            correction = -skew
            deskew_applied = correction
            m23 = cv2.getRotationMatrix2D((ww / 2.0, wh / 2.0), correction, 1.0)
            # m23 is used as the destination -> source map (WARP_INVERSE_MAP):
            # dst(p) = src(m23 @ p). Without the flag OpenCV treats the matrix as the
            # forward map, which would rotate by -skew and double the skew instead of
            # removing it, and would make a_desk below map boxes the wrong way.
            work = cv2.warpAffine(
                work,
                m23,
                (ww, wh),
                flags=cv2.INTER_LINEAR | cv2.WARP_INVERSE_MAP,
                borderMode=cv2.BORDER_REPLICATE,
            )
            a_desk = np.vstack([m23.astype(np.float64), np.array([[0.0, 0.0, 1.0]], dtype=np.float64)])

    # final work -> original: inv(T_scale) @ inv(M_src_to_dst) @ A_desk
    # (original -> final:     inv(A_desk) @ M_src_to_dst @ T_scale)
    m_inv = np.linalg.inv(m_src_to_dst)
    work_to_orig = np.linalg.inv(t_scale) @ m_inv @ a_desk
    geom = PageGeometry(
        orig_w=orig_w,
        orig_h=orig_h,
        work_w=int(work.shape[1]),
        work_h=int(work.shape[0]),
        scale_x=sx,
        scale_y=sy,
        deskew_deg_applied=deskew_applied,
        _work_to_orig=work_to_orig,
    )
    return work, geom, meta