Spaces:
Running
Running
| #!/usr/bin/env python3 | |
| """ | |
| Notebook Auto-Crop Tool v5 — Tight-Crop Fix | |
| """ | |
| import cv2 | |
| import numpy as np | |
| import sys | |
| import os | |
| import json | |
| from pathlib import Path | |
| from google import genai | |
| from google.genai import types | |
def order_points(pts):
    """Return the 4 corner points ordered TL, TR, BR, BL as a float32 array."""
    ordered = np.zeros((4, 2), dtype="float32")
    coord_sums = pts.sum(axis=1)        # x + y per point
    coord_diffs = np.diff(pts, axis=1)  # y - x per point
    ordered[0] = pts[np.argmin(coord_sums)]   # top-left: smallest x + y
    ordered[2] = pts[np.argmax(coord_sums)]   # bottom-right: largest x + y
    ordered[1] = pts[np.argmin(coord_diffs)]  # top-right: smallest y - x
    ordered[3] = pts[np.argmax(coord_diffs)]  # bottom-left: largest y - x
    return ordered
def four_point_transform(image, pts):
    """Perspective-warp the quadrilateral `pts` in `image` to an upright rectangle.

    The output size is taken from the longer of the two opposing edge pairs,
    clamped to at least 1 px in each dimension.
    """
    src = order_points(pts)
    tl, tr, br, bl = src
    out_w = max(1, int(max(np.linalg.norm(br - bl), np.linalg.norm(tr - tl))))
    out_h = max(1, int(max(np.linalg.norm(tr - br), np.linalg.norm(tl - bl))))
    dst = np.array(
        [[0, 0], [out_w - 1, 0], [out_w - 1, out_h - 1], [0, out_h - 1]],
        dtype="float32",
    )
    warp = cv2.getPerspectiveTransform(src, dst)
    return cv2.warpPerspective(image, warp, (out_w, out_h))
def is_valid_quad(quad, img_shape):
    """Heuristic sanity check on a candidate document quad.

    Rejects quads with any corner angle outside [30, 150] degrees, degenerate
    (near-zero) edges, or an average aspect ratio beyond 5:1.
    `img_shape` is accepted for interface compatibility but not consulted.
    """
    corners = order_points(quad.astype(np.float32))
    # Every interior angle must be reasonably close to 90 degrees.
    for idx in range(4):
        to_prev = corners[(idx - 1) % 4] - corners[idx]
        to_next = corners[(idx + 1) % 4] - corners[idx]
        norm_prod = np.linalg.norm(to_prev) * np.linalg.norm(to_next)
        if norm_prod < 1e-6:
            return False  # coincident corners -> degenerate quad
        cos_a = np.clip(np.dot(to_prev, to_next) / norm_prod, -1, 1)
        angle_deg = np.degrees(np.arccos(cos_a))
        if angle_deg < 30 or angle_deg > 150:
            return False
    # Average the two widths and the two heights, then bound the aspect ratio.
    top_w = np.linalg.norm(corners[1] - corners[0])
    bot_w = np.linalg.norm(corners[2] - corners[3])
    left_h = np.linalg.norm(corners[3] - corners[0])
    right_h = np.linalg.norm(corners[2] - corners[1])
    avg_w = (top_w + bot_w) / 2
    avg_h = (left_h + right_h) / 2
    if min(avg_w, avg_h) < 1:
        return False
    return max(avg_w, avg_h) / min(avg_w, avg_h) <= 5.0
def expand_quad(quad, img_shape, margin_frac=0.025):
    """Push each corner of `quad` outward from the centroid by `margin_frac`.

    Parameters:
        quad: (4, 2) array of corner points.
        img_shape: image shape tuple; only the first two entries (h, w) are
            used, to clamp the expanded corners inside the image.
        margin_frac: fraction of each corner's distance from the centroid to
            move it outward (0.025 = 2.5% margin).

    Returns a float32 (4, 2) array, clipped to [0, w-1] x [0, h-1].
    """
    center = quad.mean(axis=0)
    # Vectorized form of "corner + (corner - center) * margin" for all corners.
    expanded = (quad + (quad - center) * margin_frac).astype(np.float32)
    h, w = img_shape[:2]
    expanded[:, 0] = np.clip(expanded[:, 0], 0, w - 1)
    expanded[:, 1] = np.clip(expanded[:, 1], 0, h - 1)
    return expanded
def get_binary_strategies(work_img):
    """Build several alternative binary document-vs-background masks.

    Returns a list of (name, mask) pairs, each mask a uint8 image where the
    document region is (ideally) white. Downstream contour search tries every
    mask, so the strategies are intentionally redundant.
    """
    gray = cv2.cvtColor(work_img, cv2.COLOR_BGR2GRAY)
    h, w = gray.shape
    # Shared morphology kernels: a large CLOSE to bridge gaps inside the page,
    # a small OPEN to remove speckle noise.
    k_close = np.ones((15, 15), np.uint8)
    k_open = np.ones((5, 5), np.uint8)
    strats = []
    # Strategy 1: Otsu threshold on heavily blurred grayscale.
    blurred = cv2.GaussianBlur(gray, (15, 15), 0)
    _, otsu = cv2.threshold(blurred, 0, 255,
                            cv2.THRESH_BINARY + cv2.THRESH_OTSU)
    otsu = cv2.morphologyEx(otsu, cv2.MORPH_CLOSE, k_close, iterations=3)
    otsu = cv2.morphologyEx(otsu, cv2.MORPH_OPEN, k_open, iterations=1)
    strats.append(("Otsu", otsu))
    # Strategy 2: Otsu on the HSV brightness (V) channel.
    hsv = cv2.cvtColor(work_img, cv2.COLOR_BGR2HSV)
    v_ch = cv2.GaussianBlur(hsv[:, :, 2], (15, 15), 0)
    _, v_t = cv2.threshold(v_ch, 0, 255,
                           cv2.THRESH_BINARY + cv2.THRESH_OTSU)
    v_t = cv2.morphologyEx(v_t, cv2.MORPH_CLOSE, k_close, iterations=3)
    v_t = cv2.morphologyEx(v_t, cv2.MORPH_OPEN, k_open, iterations=1)
    strats.append(("HSV-V", v_t))
    # Strategy 3: edge-preserving bilateral filter, then blur + Otsu.
    bilateral = cv2.bilateralFilter(gray, 9, 75, 75)
    bilateral = cv2.GaussianBlur(bilateral, (11, 11), 0)
    _, bil_t = cv2.threshold(bilateral, 0, 255,
                             cv2.THRESH_BINARY + cv2.THRESH_OTSU)
    bil_t = cv2.morphologyEx(bil_t, cv2.MORPH_CLOSE, k_close, iterations=3)
    bil_t = cv2.morphologyEx(bil_t, cv2.MORPH_OPEN, k_open, iterations=1)
    strats.append(("Bilateral", bil_t))
    # Strategy 4: Canny edges dilated/closed into thick boundaries, then
    # flood-fill the background inward from all four image borders (marking
    # it 128); whatever the flood could not reach is treated as document.
    b2 = cv2.GaussianBlur(gray, (9, 9), 0)
    edges = cv2.Canny(b2, 25, 80)
    edges = cv2.dilate(edges, np.ones((7, 7), np.uint8), iterations=3)
    edges = cv2.morphologyEx(edges, cv2.MORPH_CLOSE,
                             np.ones((13, 13), np.uint8), iterations=2)
    flood = edges.copy()
    fmask = np.zeros((h + 2, w + 2), np.uint8)  # floodFill needs a 2px-padded mask
    # Seed the flood at ~20 evenly spaced points along each border.
    step = max(1, min(w, h) // 20)
    for x in range(0, w, step):
        if flood[0, x] == 0:
            cv2.floodFill(flood, fmask, (x, 0), 128)
        if flood[h - 1, x] == 0:
            cv2.floodFill(flood, fmask, (x, h - 1), 128)
    for y in range(0, h, step):
        if flood[y, 0] == 0:
            cv2.floodFill(flood, fmask, (0, y), 128)
        if flood[y, w - 1] == 0:
            cv2.floodFill(flood, fmask, (w - 1, y), 128)
    # Background (128) -> black; everything else -> white document mask.
    doc = np.where(flood == 128, 0, 255).astype(np.uint8)
    doc = cv2.morphologyEx(doc, cv2.MORPH_CLOSE, k_close, iterations=2)
    strats.append(("FloodFill", doc))
    return strats
def _first_quad_approx(poly):
    """Return the first 4-vertex polygonal approximation of `poly`, or None.

    Sweeps the approxPolyDP epsilon from 1% to 10% of the perimeter and stops
    at the first 4-gon. Stops early once the approximation drops below 4
    vertices, since a larger epsilon can only simplify further.
    """
    peri = cv2.arcLength(poly, True)
    for eps in np.linspace(0.01, 0.1, 20):
        approx = cv2.approxPolyDP(poly, eps * peri, True)
        if len(approx) == 4:
            return approx.reshape(4, 2).astype(np.float32)
        if len(approx) < 4:
            return None
    return None


def find_notebook_contour(work_img):
    """Search every binarization strategy for the best notebook quadrilateral.

    Returns (best_quad, all_quads, is_fallback):
        best_quad: float32 (4, 2) corner array, or None if nothing was found.
        all_quads: every valid candidate quad, for debug rendering.
        is_fallback: True when best_quad is only a minAreaRect around the
            largest contour (no proper quad detected).
    """
    strategies = get_binary_strategies(work_img)
    img_area = work_img.shape[0] * work_img.shape[1]
    best_quad = None
    best_area = 0
    all_quads = []
    max_cnt = None       # largest contour seen anywhere, kept for the fallback
    max_cnt_area = 0
    for name, binary in strategies:
        contours, _ = cv2.findContours(binary, cv2.RETR_EXTERNAL,
                                       cv2.CHAIN_APPROX_SIMPLE)
        # Only the 5 largest contours per strategy are worth examining.
        contours = sorted(contours, key=cv2.contourArea, reverse=True)[:5]
        for cnt in contours:
            area = cv2.contourArea(cnt)
            if area > max_cnt_area:
                max_cnt_area = area
                max_cnt = cnt
            if area < 0.15 * img_area:
                continue  # too small to be the notebook
            # Try a quad fit on the raw contour, then on its convex hull.
            for poly in (cnt, cv2.convexHull(cnt)):
                q = _first_quad_approx(poly)
                if q is not None and is_valid_quad(q, work_img.shape):
                    all_quads.append(q)
                    if area > best_area:
                        best_area = area
                        best_quad = q
            # Large contours also contribute a rotated bounding box, scored
            # at 90% of their area so a true quad fit is preferred.
            if area > 0.20 * img_area:
                box = cv2.boxPoints(cv2.minAreaRect(cnt)).astype(np.float32)
                if is_valid_quad(box, work_img.shape):
                    all_quads.append(box)
                    if area * 0.90 > best_area:
                        best_area = area * 0.90
                        best_quad = box
    is_fallback = False
    # Last resort: rotated bounding box of the biggest contour, if sizeable.
    if best_quad is None and max_cnt is not None \
            and max_cnt_area > 0.10 * img_area:
        best_quad = cv2.boxPoints(cv2.minAreaRect(max_cnt)).astype(np.float32)
        all_quads.append(best_quad)
        is_fallback = True
    return best_quad, all_quads, is_fallback
def draw_debug_image(work_img, corners, all_quads, is_fallback):
    """Render a debug overlay: candidate quads, the chosen quad, and a banner.

    Candidates are thin yellow; the chosen quad is green (or orange when it
    came from the fallback path) with labelled corner dots. Returns a copy of
    `work_img`; the input is not modified.
    """
    debug = work_img.copy()
    h, w = debug.shape[:2]
    for q in all_quads:
        cv2.polylines(debug, [q.astype(np.int32)], True, (0, 255, 255), 1)
    if corners is not None:
        color = (0, 165, 255) if is_fallback else (0, 255, 0)
        cv2.polylines(debug, [corners.astype(np.int32)], True, color, 3)
        ordered = order_points(corners)
        # Fixed: the original used enumerate() but never used the index.
        for pt, lbl, c in zip(
                ordered, ["TL", "TR", "BR", "BL"],
                [(255, 0, 0), (0, 0, 255), (255, 0, 255), (0, 255, 0)]):
            cx, cy = int(pt[0]), int(pt[1])
            cv2.circle(debug, (cx, cy), 8, c, -1)
            cv2.putText(debug, lbl, (cx + 10, cy + 5),
                        cv2.FONT_HERSHEY_SIMPLEX, 0.6, c, 2)
    # Black status banner across the top.
    cv2.rectangle(debug, (0, 0), (w, 40), (0, 0, 0), -1)
    if corners is not None:
        s, c = ("FALLBACK", (0, 165, 255)) if is_fallback \
            else ("QUAD DETECTED (green outline)", (0, 255, 0))
    else:
        s, c = "NOTHING DETECTED", (0, 0, 255)
    cv2.putText(debug, s, (10, 25), cv2.FONT_HERSHEY_SIMPLEX, 0.7, c, 2)
    return debug
def save_binary_debug(work_img, debug_path):
    """Write a side-by-side strip of every binarization strategy's mask.

    Each mask is resized to 300 px wide, labelled with its strategy name,
    bottom-padded to a common height, and the strip is saved next to
    `debug_path` with "_debug." replaced by "_binary_debug.".
    """
    target_w = 300
    labeled = []
    for label, mask in get_binary_strategies(work_img):
        scale = target_w / mask.shape[1]
        small = cv2.resize(mask, (target_w, int(mask.shape[0] * scale)))
        panel = cv2.cvtColor(small, cv2.COLOR_GRAY2BGR)
        cv2.putText(panel, label, (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 0.7,
                    (0, 255, 0), 2)
        labeled.append(panel)
    tallest = max(p.shape[0] for p in labeled)
    padded = [
        p if p.shape[0] >= tallest
        else np.vstack([p, np.zeros((tallest - p.shape[0], p.shape[1], 3),
                                    np.uint8)])
        for p in labeled
    ]
    cv2.imwrite(debug_path.replace("_debug.", "_binary_debug."),
                np.hstack(padded), [cv2.IMWRITE_JPEG_QUALITY, 85])
def get_rotation_from_gemini(image_bytes: bytes) -> str:
    """Ask Gemini which rotation makes the photographed text readable.

    Returns one of "0", "180", "90_clockwise", "90_counterclockwise".
    Falls back to "90_counterclockwise" when the API key is missing or the
    call/parse fails (the value is not otherwise validated — TODO confirm
    callers tolerate arbitrary strings; they currently treat unknowns as "0").
    """
    api_key = os.environ.get("GEMINI_API_KEY")
    if not api_key:
        print("[WARN] GEMINI_API_KEY not set. Defaulting to 90_counterclockwise", flush=True)
        return "90_counterclockwise"
    client = genai.Client(api_key=api_key)
    model = "gemini-3.1-flash-lite-preview"
    # Few-shot style conversation: image, a canned model reply demonstrating
    # the expected JSON shape, then the actual question.
    contents = [
        types.Content(
            role="user",
            parts=[
                # Defaulting to image/jpeg, handles most cases
                types.Part.from_bytes(mime_type="image/jpeg", data=image_bytes),
            ],
        ),
        types.Content(
            role="model",
            parts=[
                types.Part.from_text(text="""```json\n{"rotation": "0"}\n```"""),
            ],
        ),
        types.Content(
            role="user",
            parts=[
                types.Part.from_text(text="""Determine the rotation needed to make this image readable."""),
            ],
        ),
    ]
    # Temperature 0 for a deterministic classification answer.
    generate_content_config = types.GenerateContentConfig(
        system_instruction=[
            types.Part.from_text(text='''you are the AI which detects which orientation the image should be rotated such that the text becomes readable.
output strict json:
{"rotation": "90_counterclockwise", "90_clockwise", "180", "0"}'''),
        ],
        temperature=0.0
    )
    try:
        response = client.models.generate_content(
            model=model,
            contents=contents,
            config=generate_content_config,
        )
        text = response.text
        # Strip a possible markdown code fence around the JSON payload.
        if "```json" in text:
            text = text.split("```json")[1].split("```")[0].strip()
        elif "```" in text:
            text = text.split("```")[1].split("```")[0].strip()
        data = json.loads(text)
        return data.get("rotation", "0")
    except Exception as e:
        # Any API/parse error degrades to the most common orientation.
        print(f"[ERROR] Gemini rotation detection failed: {e}", flush=True)
        return "90_counterclockwise"
def process_image(input_path: str):
    """Rotate, detect, crop, and save one notebook photo from disk.

    Writes `<stem>_cropped.jpg`, `<stem>_debug.jpg`, and
    `<stem>_binary_debug.jpg` next to this script. Returns None.
    """
    script_dir = os.path.dirname(os.path.abspath(__file__))
    image = cv2.imread(input_path)
    if image is None:
        print(f"[ERROR] Cannot read: {input_path}")
        return
    # Raw bytes are needed for the Gemini orientation call.
    with open(input_path, "rb") as f:
        image_bytes = f.read()
    rotation_str = get_rotation_from_gemini(image_bytes)
    print(f"[INFO] Gemini detected rotation: {rotation_str}", flush=True)
    rotation_codes = {
        "90_counterclockwise": cv2.ROTATE_90_COUNTERCLOCKWISE,
        "90_clockwise": cv2.ROTATE_90_CLOCKWISE,
        "180": cv2.ROTATE_180,
    }
    code = rotation_codes.get(rotation_str)
    rotated = image if code is None else cv2.rotate(image, code)
    orig_h, orig_w = rotated.shape[:2]
    # Detection runs on a copy resized so the longest side is 800 px.
    max_dim = 800.0
    ratio = max(orig_h, orig_w) / max_dim
    work_w = int(orig_w / ratio)
    work_h = int(orig_h / ratio)
    work_img = cv2.resize(rotated, (work_w, work_h))
    corners, all_quads, is_fallback = find_notebook_contour(work_img)
    stem = Path(input_path).stem
    debug_path = os.path.join(script_dir, f"{stem}_debug.jpg")
    if corners is not None:
        # Add a small margin, rescale corners to full resolution, and warp.
        corners_exp = expand_quad(corners, work_img.shape, margin_frac=0.025)
        corners_orig = corners_exp.copy()
        corners_orig[:, 0] *= orig_w / work_w
        corners_orig[:, 1] *= orig_h / work_h
        corners_orig[:, 0] = np.clip(corners_orig[:, 0], 0, orig_w - 1)
        corners_orig[:, 1] = np.clip(corners_orig[:, 1], 0, orig_h - 1)
        cropped = four_point_transform(rotated, corners_orig)
        print("[INFO] Success! Applied crop.")
    else:
        print("[WARN] Total failure. Returning full rotated image.")
        cropped = rotated
    debug_img = draw_debug_image(work_img, corners, all_quads, is_fallback)
    save_binary_debug(work_img, debug_path)
    cv2.imwrite(debug_path, debug_img, [cv2.IMWRITE_JPEG_QUALITY, 90])
    out_path = os.path.join(script_dir, f"{stem}_cropped.jpg")
    cv2.imwrite(out_path, cropped, [cv2.IMWRITE_JPEG_QUALITY, 95])
    print(f"[INFO] Saved cropped: {out_path}")
if __name__ == "__main__":
    if len(sys.argv) < 2:
        # No CLI arguments: batch-process every image sitting next to the
        # script, skipping files this tool itself produced.
        script_dir = os.path.dirname(os.path.abspath(__file__))
        image_exts = (".jpg", ".jpeg", ".png", ".bmp", ".webp")
        generated_tags = ("_cropped", "_debug", "_binary_debug")
        candidates = sorted(
            name for name in os.listdir(script_dir)
            if name.lower().endswith(image_exts)
            and not any(tag in name for tag in generated_tags)
        )
        if not candidates:
            print("Place images next to script or provide paths.")
            sys.exit(1)
        for fn in candidates:
            print(f"\nProcessing: {fn}")
            process_image(os.path.join(script_dir, fn))
    else:
        # Explicit paths on the command line.
        for p in sys.argv[1:]:
            print(f"\nProcessing: {p}")
            process_image(p)
def auto_crop_process(image_bytes: bytes) -> bytes:
    """In-memory variant of process_image for JPEG/PNG bytes.

    1. Decode the bytes (returned unchanged if they don't decode).
    2. Rotate per Gemini's detected orientation (NOT a fixed 90 CCW — the
       original docstring was stale on this point).
    3. Detect the notebook quad on a downscaled copy and perspective-crop.
    4. Return JPEG bytes; on encoder failure the input bytes are returned.
    """
    nparr = np.frombuffer(image_bytes, np.uint8)
    image = cv2.imdecode(nparr, cv2.IMREAD_COLOR)
    if image is None:
        return image_bytes
    # 1. Rotate according to Gemini's orientation guess.
    rotation_str = get_rotation_from_gemini(image_bytes)
    print(f"[PROCESS] Gemini detected rotation: {rotation_str}", flush=True)
    if rotation_str == "90_counterclockwise":
        rotated = cv2.rotate(image, cv2.ROTATE_90_COUNTERCLOCKWISE)
    elif rotation_str == "90_clockwise":
        rotated = cv2.rotate(image, cv2.ROTATE_90_CLOCKWISE)
    elif rotation_str == "180":
        rotated = cv2.rotate(image, cv2.ROTATE_180)
    else:
        rotated = image
    orig_h, orig_w = rotated.shape[:2]
    # 2. Resize for detection: longest side becomes 800 px.
    max_dim = 800.0
    ratio = max(orig_h, orig_w) / max_dim
    work_w = int(orig_w / ratio)
    work_h = int(orig_h / ratio)
    work_img = cv2.resize(rotated, (work_w, work_h))
    # 3. Find the notebook quad on the small image.
    corners, all_quads, is_fallback = find_notebook_contour(work_img)
    # 4. Expand the quad slightly, map it back to full resolution, and warp.
    if corners is not None:
        corners_exp = expand_quad(corners, work_img.shape, margin_frac=0.025)
        scale_x = orig_w / work_w
        scale_y = orig_h / work_h
        corners_orig = corners_exp.copy()
        corners_orig[:, 0] *= scale_x
        corners_orig[:, 1] *= scale_y
        corners_orig[:, 0] = np.clip(corners_orig[:, 0], 0, orig_w - 1)
        corners_orig[:, 1] = np.clip(corners_orig[:, 1], 0, orig_h - 1)
        cropped = four_point_transform(rotated, corners_orig)
    else:
        cropped = rotated  # detection failed: return the full rotated image
    # 5. Encode back to bytes. Fixed: the original ignored the success flag,
    # which would crash on `.tobytes()` if encoding ever failed.
    ok, result_bytes = cv2.imencode('.jpg', cropped, [cv2.IMWRITE_JPEG_QUALITY, 95])
    if not ok:
        return image_bytes
    return result_bytes.tobytes()