#!/usr/bin/env python3
"""
Notebook Auto-Crop Tool v5 — Tight-Crop Fix

Detects a notebook page in a photo, deskews it with a perspective warp,
and (optionally) asks Gemini which rotation makes the text readable.
"""
import cv2
import numpy as np
import sys
import os
import json
from pathlib import Path

# Optional dependency: only required for Gemini-based rotation detection.
# Guarded so the OpenCV cropping utilities remain usable without it.
try:
    from google import genai
    from google.genai import types
except ImportError:  # pragma: no cover - environment dependent
    genai = None
    types = None


def order_points(pts):
    """Order 4 points as [top-left, top-right, bottom-right, bottom-left].

    TL has the smallest x+y sum, BR the largest; TR has the smallest
    y-x difference, BL the largest.
    """
    rect = np.zeros((4, 2), dtype="float32")
    s = pts.sum(axis=1)
    rect[0] = pts[np.argmin(s)]
    rect[2] = pts[np.argmax(s)]
    diff = np.diff(pts, axis=1)
    rect[1] = pts[np.argmin(diff)]
    rect[3] = pts[np.argmax(diff)]
    return rect


def four_point_transform(image, pts):
    """Perspective-warp the quad `pts` in `image` to an axis-aligned rectangle.

    Output size is taken from the longer of the two opposing edge pairs,
    clamped to at least 1 px so a degenerate quad cannot produce a 0-size warp.
    """
    rect = order_points(pts)
    (tl, tr, br, bl) = rect
    maxW = max(int(max(np.linalg.norm(br - bl), np.linalg.norm(tr - tl))), 1)
    maxH = max(int(max(np.linalg.norm(tr - br), np.linalg.norm(tl - bl))), 1)
    dst = np.array([[0, 0], [maxW - 1, 0], [maxW - 1, maxH - 1], [0, maxH - 1]],
                   dtype="float32")
    M = cv2.getPerspectiveTransform(rect, dst)
    return cv2.warpPerspective(image, M, (maxW, maxH))


def is_valid_quad(quad, img_shape):
    """Heuristic sanity check on a candidate page quadrilateral.

    Rejects quads with any corner angle outside [30°, 150°], near-zero
    edges, or an average aspect ratio above 5:1.  `img_shape` is kept for
    interface compatibility (unused here).
    """
    ordered = order_points(quad.astype(np.float32))
    for i in range(4):
        v1 = ordered[(i - 1) % 4] - ordered[i]
        v2 = ordered[(i + 1) % 4] - ordered[i]
        denom = np.linalg.norm(v1) * np.linalg.norm(v2)
        if denom < 1e-6:
            return False
        angle = np.degrees(np.arccos(np.clip(np.dot(v1, v2) / denom, -1, 1)))
        if angle < 30 or angle > 150:
            return False
    w1 = np.linalg.norm(ordered[1] - ordered[0])
    w2 = np.linalg.norm(ordered[2] - ordered[3])
    h1 = np.linalg.norm(ordered[3] - ordered[0])
    h2 = np.linalg.norm(ordered[2] - ordered[1])
    avg_w, avg_h = (w1 + w2) / 2, (h1 + h2) / 2
    if min(avg_w, avg_h) < 1:
        return False
    return max(avg_w, avg_h) / min(avg_w, avg_h) <= 5.0


def expand_quad(quad, img_shape, margin_frac=0.025):
    """Push each corner outward from the quad's centroid by `margin_frac`,
    then clip the result to the image bounds."""
    center = quad.mean(axis=0)
    expanded = quad.copy().astype(np.float32)
    for i in range(len(quad)):
        vec = quad[i] - center
        expanded[i] = quad[i] + vec * margin_frac
    h, w = img_shape[:2]
    expanded[:, 0] = np.clip(expanded[:, 0], 0, w - 1)
    expanded[:, 1] = np.clip(expanded[:, 1], 0, h - 1)
    return expanded


def get_binary_strategies(work_img):
    """Build several binary foreground masks for the page.

    Returns a list of (name, binary_mask) pairs:
    Otsu on gray, Otsu on HSV value, Otsu after bilateral filtering,
    and a Canny-edge + border flood-fill mask.
    """
    gray = cv2.cvtColor(work_img, cv2.COLOR_BGR2GRAY)
    h, w = gray.shape
    k_close = np.ones((15, 15), np.uint8)
    k_open = np.ones((5, 5), np.uint8)
    strats = []

    # Strategy 1: Otsu threshold on blurred grayscale.
    blurred = cv2.GaussianBlur(gray, (15, 15), 0)
    _, otsu = cv2.threshold(blurred, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
    otsu = cv2.morphologyEx(otsu, cv2.MORPH_CLOSE, k_close, iterations=3)
    otsu = cv2.morphologyEx(otsu, cv2.MORPH_OPEN, k_open, iterations=1)
    strats.append(("Otsu", otsu))

    # Strategy 2: Otsu on the HSV value channel (robust to colored pages).
    hsv = cv2.cvtColor(work_img, cv2.COLOR_BGR2HSV)
    v_ch = cv2.GaussianBlur(hsv[:, :, 2], (15, 15), 0)
    _, v_t = cv2.threshold(v_ch, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
    v_t = cv2.morphologyEx(v_t, cv2.MORPH_CLOSE, k_close, iterations=3)
    v_t = cv2.morphologyEx(v_t, cv2.MORPH_OPEN, k_open, iterations=1)
    strats.append(("HSV-V", v_t))

    # Strategy 3: edge-preserving bilateral filter, then Otsu.
    bilateral = cv2.bilateralFilter(gray, 9, 75, 75)
    bilateral = cv2.GaussianBlur(bilateral, (11, 11), 0)
    _, bil_t = cv2.threshold(bilateral, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
    bil_t = cv2.morphologyEx(bil_t, cv2.MORPH_CLOSE, k_close, iterations=3)
    bil_t = cv2.morphologyEx(bil_t, cv2.MORPH_OPEN, k_open, iterations=1)
    strats.append(("Bilateral", bil_t))

    # Strategy 4: Canny edges, then flood-fill the background from the
    # image border (marked 128); everything NOT reached is the document.
    b2 = cv2.GaussianBlur(gray, (9, 9), 0)
    edges = cv2.Canny(b2, 25, 80)
    edges = cv2.dilate(edges, np.ones((7, 7), np.uint8), iterations=3)
    edges = cv2.morphologyEx(edges, cv2.MORPH_CLOSE,
                             np.ones((13, 13), np.uint8), iterations=2)
    flood = edges.copy()
    fmask = np.zeros((h + 2, w + 2), np.uint8)  # floodFill needs a 2px-padded mask
    step = max(1, min(w, h) // 20)
    for x in range(0, w, step):
        if flood[0, x] == 0:
            cv2.floodFill(flood, fmask, (x, 0), 128)
        if flood[h - 1, x] == 0:
            cv2.floodFill(flood, fmask, (x, h - 1), 128)
    for y in range(0, h, step):
        if flood[y, 0] == 0:
            cv2.floodFill(flood, fmask, (0, y), 128)
        if flood[y, w - 1] == 0:
            cv2.floodFill(flood, fmask, (w - 1, y), 128)
    doc = np.where(flood == 128, 0, 255).astype(np.uint8)
    doc = cv2.morphologyEx(doc, cv2.MORPH_CLOSE, k_close, iterations=2)
    strats.append(("FloodFill", doc))
    return strats


def find_notebook_contour(work_img):
    """Search every binary strategy for the best page quadrilateral.

    Returns (best_quad, all_quads, is_fallback):
    - best_quad: float32 (4, 2) corners, or None if nothing was found;
    - all_quads: every plausible candidate (for debug overlay);
    - is_fallback: True when best_quad is just the min-area rect of the
      largest contour rather than a properly-detected quad.
    """
    strategies = get_binary_strategies(work_img)
    img_area = work_img.shape[0] * work_img.shape[1]
    best_quad = None
    best_area = 0
    all_quads = []
    is_fallback = False
    max_cnt = None
    max_cnt_area = 0
    for name, binary in strategies:
        contours, _ = cv2.findContours(binary, cv2.RETR_EXTERNAL,
                                       cv2.CHAIN_APPROX_SIMPLE)
        contours = sorted(contours, key=cv2.contourArea, reverse=True)[:5]
        for cnt in contours:
            area = cv2.contourArea(cnt)
            # Track the overall largest contour for the fallback path.
            if area > max_cnt_area:
                max_cnt_area = area
                max_cnt = cnt
            if area < 0.15 * img_area:
                continue
            # Approximate the raw contour with increasing tolerance until
            # it collapses to 4 points (or fewer, which means overshoot).
            peri = cv2.arcLength(cnt, True)
            for eps in np.linspace(0.01, 0.1, 20):
                approx = cv2.approxPolyDP(cnt, eps * peri, True)
                if len(approx) == 4:
                    q = approx.reshape(4, 2).astype(np.float32)
                    if is_valid_quad(q, work_img.shape):
                        all_quads.append(q)
                        if area > best_area:
                            best_area = area
                            best_quad = q
                    break
                elif len(approx) < 4:
                    break
            # Same sweep on the convex hull (handles concave nicks).
            hull = cv2.convexHull(cnt)
            peri_h = cv2.arcLength(hull, True)
            for eps in np.linspace(0.01, 0.1, 20):
                approx = cv2.approxPolyDP(hull, eps * peri_h, True)
                if len(approx) == 4:
                    q = approx.reshape(4, 2).astype(np.float32)
                    if is_valid_quad(q, work_img.shape):
                        all_quads.append(q)
                        if area > best_area:
                            best_area = area
                            best_quad = q
                    break
                elif len(approx) < 4:
                    break
            # Min-area rectangle candidate, discounted 10% so a true
            # polygon fit of similar size is preferred over a box.
            if area > 0.20 * img_area:
                box = cv2.boxPoints(cv2.minAreaRect(cnt)).astype(np.float32)
                if is_valid_quad(box, work_img.shape):
                    all_quads.append(box)
                    if area * 0.90 > best_area:
                        best_area = area * 0.90
                        best_quad = box
    # Last resort: bounding box of the largest contour seen anywhere.
    if best_quad is None and max_cnt is not None \
            and max_cnt_area > 0.10 * img_area:
        box = cv2.boxPoints(cv2.minAreaRect(max_cnt)).astype(np.float32)
        best_quad = box
        all_quads.append(box)
        is_fallback = True
    return best_quad, all_quads, is_fallback


def draw_debug_image(work_img, corners, all_quads, is_fallback):
    """Render a diagnostic overlay: all candidates in thin yellow, the
    chosen quad in green (orange if fallback), labeled corners, and a
    status banner."""
    debug = work_img.copy()
    h, w = debug.shape[:2]
    for q in all_quads:
        cv2.polylines(debug, [q.astype(np.int32)], True, (0, 255, 255), 1)
    if corners is not None:
        color = (0, 165, 255) if is_fallback else (0, 255, 0)
        cv2.polylines(debug, [corners.astype(np.int32)], True, color, 3)
        ordered = order_points(corners)
        for i, (pt, lbl, c) in enumerate(zip(
                ordered, ["TL", "TR", "BR", "BL"],
                [(255, 0, 0), (0, 0, 255), (255, 0, 255), (0, 255, 0)])):
            cx, cy = int(pt[0]), int(pt[1])
            cv2.circle(debug, (cx, cy), 8, c, -1)
            cv2.putText(debug, lbl, (cx + 10, cy + 5),
                        cv2.FONT_HERSHEY_SIMPLEX, 0.6, c, 2)
    cv2.rectangle(debug, (0, 0), (w, 40), (0, 0, 0), -1)
    if corners is not None:
        s, c = ("FALLBACK", (0, 165, 255)) if is_fallback \
            else ("QUAD DETECTED (green outline)", (0, 255, 0))
    else:
        s, c = "NOTHING DETECTED", (0, 0, 255)
    cv2.putText(debug, s, (10, 25), cv2.FONT_HERSHEY_SIMPLEX, 0.7, c, 2)
    return debug


def save_binary_debug(work_img, debug_path):
    """Save a side-by-side strip of every binary strategy's mask, next to
    `debug_path` with a `_binary_debug.` suffix."""
    strategies = get_binary_strategies(work_img)
    panels = []
    tw = 300  # each panel resized to this width
    for name, pan in strategies:
        r = tw / pan.shape[1]
        res = cv2.resize(pan, (tw, int(pan.shape[0] * r)))
        cp = cv2.cvtColor(res, cv2.COLOR_GRAY2BGR)
        cv2.putText(cp, name, (10, 30), cv2.FONT_HERSHEY_SIMPLEX,
                    0.7, (0, 255, 0), 2)
        panels.append(cp)
    # Pad panels to a common height so hstack is valid.
    mh = max(p.shape[0] for p in panels)
    padded = []
    for p in panels:
        if p.shape[0] < mh:
            p = np.vstack([p, np.zeros((mh - p.shape[0], p.shape[1], 3),
                                       np.uint8)])
        padded.append(p)
    cv2.imwrite(debug_path.replace("_debug.", "_binary_debug."),
                np.hstack(padded), [cv2.IMWRITE_JPEG_QUALITY, 85])


def get_rotation_from_gemini(image_bytes: bytes) -> str:
    """Ask Gemini which rotation makes the photographed text readable.

    Returns one of "90_counterclockwise", "90_clockwise", "180", "0".
    Falls back to "90_counterclockwise" when the SDK is missing, the API
    key is unset, or the request fails; unknown parse results yield "0".
    """
    if genai is None:
        # SDK not installed — same graceful fallback as a missing key.
        print("[WARN] google-genai not installed. Defaulting to 90_counterclockwise",
              flush=True)
        return "90_counterclockwise"
    api_key = os.environ.get("GEMINI_API_KEY")
    if not api_key:
        print("[WARN] GEMINI_API_KEY not set. Defaulting to 90_counterclockwise",
              flush=True)
        return "90_counterclockwise"
    client = genai.Client(api_key=api_key)
    model = "gemini-3.1-flash-lite-preview"
    contents = [
        types.Content(
            role="user",
            parts=[
                # Defaulting to image/jpeg, handles most cases
                types.Part.from_bytes(mime_type="image/jpeg", data=image_bytes),
            ],
        ),
        # Seed a model turn so the reply mimics this strict-JSON shape.
        types.Content(
            role="model",
            parts=[
                types.Part.from_text(text="""```json\n{"rotation": "0"}\n```"""),
            ],
        ),
        types.Content(
            role="user",
            parts=[
                types.Part.from_text(text="""Determine the rotation needed to make this image readable."""),
            ],
        ),
    ]
    generate_content_config = types.GenerateContentConfig(
        system_instruction=[
            types.Part.from_text(text='''you are the AI which detects which orientation the image should be rotated such that the text becomes readable. output strict json: {"rotation": "90_counterclockwise", "90_clockwise", "180", "0"}'''),
        ],
        temperature=0.0
    )
    try:
        response = client.models.generate_content(
            model=model,
            contents=contents,
            config=generate_content_config,
        )
        text = response.text
        # Strip a markdown code fence if the model wrapped the JSON in one.
        if "```json" in text:
            text = text.split("```json")[1].split("```")[0].strip()
        elif "```" in text:
            text = text.split("```")[1].split("```")[0].strip()
        data = json.loads(text)
        return data.get("rotation", "0")
    except Exception as e:
        print(f"[ERROR] Gemini rotation detection failed: {e}", flush=True)
        return "90_counterclockwise"


def _rotate_by_label(image, rotation_str):
    """Apply the rotation named by `rotation_str`; any unrecognized label
    (including "0") returns the image unchanged."""
    if rotation_str == "90_counterclockwise":
        return cv2.rotate(image, cv2.ROTATE_90_COUNTERCLOCKWISE)
    if rotation_str == "90_clockwise":
        return cv2.rotate(image, cv2.ROTATE_90_CLOCKWISE)
    if rotation_str == "180":
        return cv2.rotate(image, cv2.ROTATE_180)
    return image


def _detect_and_crop(rotated):
    """Shared detection pipeline for `process_image` and `auto_crop_process`.

    Downscales to at most 800 px, detects the page quad, expands it
    slightly, rescales the corners to full resolution, and warps.
    Returns (cropped_or_None, corners, all_quads, is_fallback, work_img);
    `cropped` is None when no quad was found.
    """
    orig_h, orig_w = rotated.shape[:2]
    max_dim = 800.0
    ratio = max(orig_h, orig_w) / max_dim
    # Clamp to 1 px so extreme aspect ratios can't produce a 0-size resize.
    work_w = max(1, int(orig_w / ratio))
    work_h = max(1, int(orig_h / ratio))
    work_img = cv2.resize(rotated, (work_w, work_h))
    corners, all_quads, is_fallback = find_notebook_contour(work_img)
    cropped = None
    if corners is not None:
        # 2.5% outward margin avoids shaving the page edge ("tight-crop fix").
        corners_exp = expand_quad(corners, work_img.shape, margin_frac=0.025)
        scale_x = orig_w / work_w
        scale_y = orig_h / work_h
        corners_orig = corners_exp.copy()
        corners_orig[:, 0] *= scale_x
        corners_orig[:, 1] *= scale_y
        corners_orig[:, 0] = np.clip(corners_orig[:, 0], 0, orig_w - 1)
        corners_orig[:, 1] = np.clip(corners_orig[:, 1], 0, orig_h - 1)
        cropped = four_point_transform(rotated, corners_orig)
    return cropped, corners, all_quads, is_fallback, work_img


def process_image(input_path: str):
    """File-based pipeline: read, auto-rotate, crop, and write the result
    plus debug images next to this script."""
    script_dir = os.path.dirname(os.path.abspath(__file__))
    image = cv2.imread(input_path)
    if image is None:
        print(f"[ERROR] Cannot read: {input_path}")
        return
    with open(input_path, "rb") as f:
        image_bytes = f.read()
    rotation_str = get_rotation_from_gemini(image_bytes)
    print(f"[INFO] Gemini detected rotation: {rotation_str}", flush=True)
    rotated = _rotate_by_label(image, rotation_str)

    cropped, corners, all_quads, is_fallback, work_img = _detect_and_crop(rotated)
    stem = Path(input_path).stem
    debug_path = os.path.join(script_dir, f"{stem}_debug.jpg")
    if cropped is not None:
        print("[INFO] Success! Applied crop.")
    else:
        print("[WARN] Total failure. Returning full rotated image.")
        cropped = rotated
    debug_img = draw_debug_image(work_img, corners, all_quads, is_fallback)
    save_binary_debug(work_img, debug_path)
    cv2.imwrite(debug_path, debug_img, [cv2.IMWRITE_JPEG_QUALITY, 90])
    out_path = os.path.join(script_dir, f"{stem}_cropped.jpg")
    cv2.imwrite(out_path, cropped, [cv2.IMWRITE_JPEG_QUALITY, 95])
    print(f"[INFO] Saved cropped: {out_path}")


def auto_crop_process(image_bytes: bytes) -> bytes:
    """In-memory variant of `process_image`.

    1. Decode JPEG/PNG bytes.
    2. Auto-rotate via Gemini.
    3. Detect and crop (full rotated image if detection fails).
    4. Return JPEG bytes; on any decode/encode failure the original
       bytes are returned unchanged.
    """
    nparr = np.frombuffer(image_bytes, np.uint8)
    image = cv2.imdecode(nparr, cv2.IMREAD_COLOR)
    if image is None:
        return image_bytes
    # 1. Rotate
    rotation_str = get_rotation_from_gemini(image_bytes)
    print(f"[PROCESS] Gemini detected rotation: {rotation_str}", flush=True)
    rotated = _rotate_by_label(image, rotation_str)
    # 2-4. Detect, crop (or keep the full rotated frame on failure).
    cropped, _, _, _, _ = _detect_and_crop(rotated)
    if cropped is None:
        cropped = rotated
    # 5. Encode back to bytes; fall back to the input on encoder failure.
    ok, result_bytes = cv2.imencode('.jpg', cropped,
                                    [cv2.IMWRITE_JPEG_QUALITY, 95])
    if not ok:
        return image_bytes
    return result_bytes.tobytes()


if __name__ == "__main__":
    if len(sys.argv) < 2:
        # No paths given: process every image sitting next to the script,
        # skipping our own output files.
        script_dir = os.path.dirname(os.path.abspath(__file__))
        exts = (".jpg", ".jpeg", ".png", ".bmp", ".webp")
        skip = ("_cropped", "_debug", "_binary_debug")
        files = [f for f in os.listdir(script_dir)
                 if f.lower().endswith(exts)
                 and not any(s in f for s in skip)]
        if not files:
            print("Place images next to script or provide paths.")
            sys.exit(1)
        for fn in sorted(files):
            print(f"\nProcessing: {fn}")
            process_image(os.path.join(script_dir, fn))
    else:
        for p in sys.argv[1:]:
            print(f"\nProcessing: {p}")
            process_image(p)