Spaces:

SolarumAsteridion
/

backend

Running

File size: 15,930 Bytes

#!/usr/bin/env python3
"""
Notebook Auto-Crop Tool v5 — Tight-Crop Fix
"""

import cv2
import numpy as np
import sys
import os
import json
from pathlib import Path
from google import genai
from google.genai import types


def order_points(pts):
    """Arrange four 2-D points as [top-left, top-right, bottom-right, bottom-left].

    The top-left / bottom-right corners are the points with the smallest /
    largest ``x + y``; the top-right / bottom-left corners are the points with
    the smallest / largest ``y - x``.  Ties resolve to the first matching row,
    so a degenerate or diamond-shaped input can yield the same point twice.

    Args:
        pts: array of shape (4, 2) holding (x, y) coordinates.

    Returns:
        A new float32 array of shape (4, 2) in TL, TR, BR, BL order.
    """
    coord_sum = pts.sum(axis=1)
    y_minus_x = pts[:, 1] - pts[:, 0]
    picks = [
        np.argmin(coord_sum),   # top-left
        np.argmin(y_minus_x),   # top-right
        np.argmax(coord_sum),   # bottom-right
        np.argmax(y_minus_x),   # bottom-left
    ]
    return pts[picks].astype("float32")


def four_point_transform(image, pts):
    """Perspective-warp the quadrilateral ``pts`` of ``image`` onto a rectangle.

    The output width is the longer of the top and bottom edges and the output
    height is the longer of the left and right edges, each truncated to an
    integer and never smaller than 1 pixel.

    Args:
        image: source image passed to ``cv2.warpPerspective``.
        pts: four (x, y) corner points in any order.

    Returns:
        The warped image of size (width, height) computed as described above.
    """
    src = order_points(pts)
    top_left, top_right, bottom_right, bottom_left = src

    bottom_len = np.linalg.norm(bottom_right - bottom_left)
    top_len = np.linalg.norm(top_right - top_left)
    right_len = np.linalg.norm(top_right - bottom_right)
    left_len = np.linalg.norm(top_left - bottom_left)

    out_w = max(int(max(bottom_len, top_len)), 1)
    out_h = max(int(max(right_len, left_len)), 1)

    # Destination corners follow the same TL, TR, BR, BL order as `src`.
    target = np.array(
        [[0, 0], [out_w - 1, 0], [out_w - 1, out_h - 1], [0, out_h - 1]],
        dtype="float32",
    )
    homography = cv2.getPerspectiveTransform(src, target)
    return cv2.warpPerspective(image, homography, (out_w, out_h))


def is_valid_quad(quad, img_shape):
    """Decide whether ``quad`` is a plausible page outline.

    A quad is rejected when any corner is degenerate (an adjacent edge has
    near-zero length), when any interior angle falls outside 30..150 degrees,
    when the shorter averaged side is under 1 pixel, or when the ratio of the
    averaged side lengths exceeds 5.

    Args:
        quad: array of four (x, y) points in any order.
        img_shape: accepted for call-site symmetry; not used by the checks.

    Returns:
        A truthy value if the quad passes every check, otherwise a falsy one.
    """
    corners = order_points(quad.astype(np.float32))

    for idx, vertex in enumerate(corners):
        to_prev = corners[(idx - 1) % 4] - vertex
        to_next = corners[(idx + 1) % 4] - vertex
        scale = np.linalg.norm(to_prev) * np.linalg.norm(to_next)
        if scale < 1e-6:
            return False
        cosine = np.clip(np.dot(to_prev, to_next) / scale, -1, 1)
        angle = np.degrees(np.arccos(cosine))
        if angle < 30 or angle > 150:
            return False

    top_left, top_right, bottom_right, bottom_left = corners
    top_len = np.linalg.norm(top_right - top_left)
    bottom_len = np.linalg.norm(bottom_right - bottom_left)
    left_len = np.linalg.norm(bottom_left - top_left)
    right_len = np.linalg.norm(bottom_right - top_right)

    avg_w = (top_len + bottom_len) / 2
    avg_h = (left_len + right_len) / 2
    shorter = min(avg_w, avg_h)
    if shorter < 1:
        return False
    return max(avg_w, avg_h) / shorter <= 5.0


def expand_quad(quad, img_shape, margin_frac=0.025):
    """Push every vertex of ``quad`` away from the centroid, then clamp to the image.

    Each vertex moves outward by ``margin_frac`` times its offset from the
    mean of all vertices.  The result is clipped to ``[0, w - 1]`` in x and
    ``[0, h - 1]`` in y, where ``h, w`` are the first two entries of
    ``img_shape``.  The input array is not modified.

    Args:
        quad: array of (x, y) points, one per row.
        img_shape: image shape whose first two entries are (height, width).
        margin_frac: fraction of the centroid offset to add to each vertex.

    Returns:
        A new float32 array with the same shape as ``quad``.
    """
    centroid = quad.mean(axis=0)
    grown = (quad + (quad - centroid) * margin_frac).astype(np.float32)

    height, width = img_shape[:2]
    grown[:, 0] = np.clip(grown[:, 0], 0, width - 1)
    grown[:, 1] = np.clip(grown[:, 1], 0, height - 1)
    return grown


def get_binary_strategies(work_img):
    """Build several alternative binary masks of ``work_img`` for contour search.

    Four masks are produced, in this order:

    * ``"Otsu"``      - Otsu threshold of the Gaussian-blurred grayscale image.
    * ``"HSV-V"``     - Otsu threshold of the blurred HSV value channel.
    * ``"Bilateral"`` - Otsu threshold after bilateral + Gaussian smoothing.
    * ``"FloodFill"`` - Canny edges thickened into barriers, then everything
      reachable from the image border is flood-filled and treated as
      background; the remainder is the foreground.

    Args:
        work_img: BGR image (it is converted with ``COLOR_BGR2GRAY`` and
            ``COLOR_BGR2HSV``).

    Returns:
        A list of ``(name, mask)`` tuples with 8-bit single-channel masks.
    """
    gray = cv2.cvtColor(work_img, cv2.COLOR_BGR2GRAY)
    rows, cols = gray.shape
    close_kernel = np.ones((15, 15), np.uint8)
    open_kernel = np.ones((5, 5), np.uint8)

    def otsu_mask(smoothed):
        # Otsu threshold, then close gaps (3 passes) and remove specks (1 pass).
        _, mask = cv2.threshold(smoothed, 0, 255,
                                cv2.THRESH_BINARY + cv2.THRESH_OTSU)
        mask = cv2.morphologyEx(mask, cv2.MORPH_CLOSE, close_kernel,
                                iterations=3)
        return cv2.morphologyEx(mask, cv2.MORPH_OPEN, open_kernel,
                                iterations=1)

    results = []

    results.append(("Otsu", otsu_mask(cv2.GaussianBlur(gray, (15, 15), 0))))

    value_channel = cv2.cvtColor(work_img, cv2.COLOR_BGR2HSV)[:, :, 2]
    results.append(
        ("HSV-V", otsu_mask(cv2.GaussianBlur(value_channel, (15, 15), 0))))

    smooth = cv2.bilateralFilter(gray, 9, 75, 75)
    smooth = cv2.GaussianBlur(smooth, (11, 11), 0)
    results.append(("Bilateral", otsu_mask(smooth)))

    # Edge-based strategy: thick, closed edges act as walls for the flood fill.
    edge_map = cv2.Canny(cv2.GaussianBlur(gray, (9, 9), 0), 25, 80)
    edge_map = cv2.dilate(edge_map, np.ones((7, 7), np.uint8), iterations=3)
    edge_map = cv2.morphologyEx(edge_map, cv2.MORPH_CLOSE,
                                np.ones((13, 13), np.uint8), iterations=2)

    filled = edge_map.copy()
    fill_mask = np.zeros((rows + 2, cols + 2), np.uint8)
    stride = max(1, min(cols, rows) // 20)

    # Seed points along the four borders: top/bottom pairs first, then
    # left/right pairs, in increasing coordinate order.
    seeds = []
    for x in range(0, cols, stride):
        seeds.append((x, 0))
        seeds.append((x, rows - 1))
    for y in range(0, rows, stride):
        seeds.append((0, y))
        seeds.append((cols - 1, y))

    for seed_x, seed_y in seeds:
        # Only start a fill from pixels that are still empty (value 0).
        if filled[seed_y, seed_x] == 0:
            cv2.floodFill(filled, fill_mask, (seed_x, seed_y), 128)

    # Pixels marked 128 were reached from the border -> background.
    foreground = np.where(filled == 128, 0, 255).astype(np.uint8)
    foreground = cv2.morphologyEx(foreground, cv2.MORPH_CLOSE, close_kernel,
                                  iterations=2)
    results.append(("FloodFill", foreground))

    return results


def find_notebook_contour(work_img):
    """Search every binarisation strategy for the best page quadrilateral.

    For each of the five largest external contours of each mask (skipping
    those under 15% of the image area) three candidates are tried: a 4-point
    polygon approximation of the contour, the same for its convex hull, and -
    for contours above 20% of the image area - the minimum-area bounding box,
    whose score is the contour area discounted by 10%.  The candidate with the
    highest score wins.  If nothing validates, the bounding box of the largest
    contour seen is used as a fallback, provided it covers more than 10% of
    the image.

    Args:
        work_img: BGR image to analyse.

    Returns:
        ``(best_quad, all_quads, is_fallback)`` where ``best_quad`` is a
        float32 (4, 2) array or ``None``, ``all_quads`` lists every accepted
        candidate, and ``is_fallback`` tells whether the fallback box was used.
    """
    frame_area = work_img.shape[0] * work_img.shape[1]
    candidates = []
    winner, winner_score = None, 0
    largest, largest_area = None, 0

    def first_valid_quad(curve):
        # Loosen the approximation tolerance until the polygon has at most
        # four vertices; accept it only if it has exactly four and validates.
        perimeter = cv2.arcLength(curve, True)
        for eps in np.linspace(0.01, 0.1, 20):
            poly = cv2.approxPolyDP(curve, eps * perimeter, True)
            if len(poly) > 4:
                continue
            if len(poly) == 4:
                quad = poly.reshape(4, 2).astype(np.float32)
                if is_valid_quad(quad, work_img.shape):
                    return quad
            return None
        return None

    for _label, mask in get_binary_strategies(work_img):
        found, _ = cv2.findContours(mask, cv2.RETR_EXTERNAL,
                                    cv2.CHAIN_APPROX_SIMPLE)
        top_five = sorted(found, key=cv2.contourArea, reverse=True)[:5]

        for contour in top_five:
            area = cv2.contourArea(contour)

            # Track the overall largest contour for the fallback path.
            if area > largest_area:
                largest, largest_area = contour, area
            if area < 0.15 * frame_area:
                continue

            # Both the raw contour and its hull are scored by the contour area.
            for curve in (contour, cv2.convexHull(contour)):
                quad = first_valid_quad(curve)
                if quad is None:
                    continue
                candidates.append(quad)
                if area > winner_score:
                    winner, winner_score = quad, area

            if area > 0.20 * frame_area:
                box = cv2.boxPoints(cv2.minAreaRect(contour)).astype(np.float32)
                if is_valid_quad(box, work_img.shape):
                    candidates.append(box)
                    discounted = area * 0.90
                    if discounted > winner_score:
                        winner, winner_score = box, discounted

    used_fallback = False
    if (winner is None and largest is not None
            and largest_area > 0.10 * frame_area):
        box = cv2.boxPoints(cv2.minAreaRect(largest)).astype(np.float32)
        winner = box
        candidates.append(box)
        used_fallback = True

    return winner, candidates, used_fallback


def draw_debug_image(work_img, corners, all_quads, is_fallback):
    """Render a diagnostic overlay of the detection result on a copy of the image.

    Every candidate quad is drawn as a thin outline; the chosen quad (if any)
    is drawn thick, in a different colour when it came from the fallback path,
    with its four ordered corners marked and labelled.  A black banner across
    the top carries a status message.

    Args:
        work_img: BGR image the detection ran on; it is not modified.
        corners: chosen quad as a (4, 2) array, or ``None`` if nothing was found.
        all_quads: iterable of candidate quads to outline.
        is_fallback: whether ``corners`` came from the fallback bounding box.

    Returns:
        The annotated copy of ``work_img``.
    """
    canvas = work_img.copy()
    width = canvas.shape[1]
    found = corners is not None

    for candidate in all_quads:
        cv2.polylines(canvas, [candidate.astype(np.int32)], True,
                      (0, 255, 255), 1)

    if found:
        outline = (0, 165, 255) if is_fallback else (0, 255, 0)
        cv2.polylines(canvas, [corners.astype(np.int32)], True, outline, 3)

        labels = ("TL", "TR", "BR", "BL")
        dot_colors = ((255, 0, 0), (0, 0, 255), (255, 0, 255), (0, 255, 0))
        for point, label, dot in zip(order_points(corners), labels, dot_colors):
            px, py = int(point[0]), int(point[1])
            cv2.circle(canvas, (px, py), 8, dot, -1)
            cv2.putText(canvas, label, (px + 10, py + 5),
                        cv2.FONT_HERSHEY_SIMPLEX, 0.6, dot, 2)

    # The banner is painted after the outlines so it always stays readable.
    cv2.rectangle(canvas, (0, 0), (width, 40), (0, 0, 0), -1)

    if not found:
        status, status_color = "NOTHING DETECTED", (0, 0, 255)
    elif is_fallback:
        status, status_color = "FALLBACK", (0, 165, 255)
    else:
        status, status_color = "QUAD DETECTED (green outline)", (0, 255, 0)

    cv2.putText(canvas, status, (10, 25), cv2.FONT_HERSHEY_SIMPLEX, 0.7,
                status_color, 2)
    return canvas


def save_binary_debug(work_img, debug_path):
    """Write a side-by-side strip of every binarisation mask next to the debug image.

    Each mask is scaled to a width of 300 pixels, converted to BGR, labelled
    with its strategy name, padded with black rows to a common height and
    joined horizontally.  The strip is saved as JPEG (quality 85) at
    ``debug_path`` with ``"_debug."`` replaced by ``"_binary_debug."``.

    Args:
        work_img: BGR image to binarise.
        debug_path: path of the main debug image, used to derive the output path.
    """
    tile_width = 300
    tiles = []
    for label, mask in get_binary_strategies(work_img):
        factor = tile_width / mask.shape[1]
        small = cv2.resize(mask, (tile_width, int(mask.shape[0] * factor)))
        tile = cv2.cvtColor(small, cv2.COLOR_GRAY2BGR)
        cv2.putText(tile, label, (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 0.7,
                    (0, 255, 0), 2)
        tiles.append(tile)

    tallest = max(tile.shape[0] for tile in tiles)
    # Zero-pad only at the bottom so every tile reaches the common height.
    row = [
        np.pad(tile, ((0, tallest - tile.shape[0]), (0, 0), (0, 0)))
        for tile in tiles
    ]

    strip_path = debug_path.replace("_debug.", "_binary_debug.")
    cv2.imwrite(strip_path, np.hstack(row), [cv2.IMWRITE_JPEG_QUALITY, 85])


def get_rotation_from_gemini(image_bytes: bytes) -> str:
    """Ask Gemini which rotation makes the text in the image readable.

    The encoded image is sent together with a short prompt, and the reply is
    expected to be a JSON object with a ``"rotation"`` key (optionally wrapped
    in a Markdown code fence).

    Args:
        image_bytes: the encoded image file contents (JPEG, PNG or WebP are
            recognised by their signature; anything else is declared as JPEG).

    Returns:
        One of ``"0"``, ``"180"``, ``"90_clockwise"`` or
        ``"90_counterclockwise"``.  ``"90_counterclockwise"`` is returned when
        ``GEMINI_API_KEY`` is not set or when the request / reply parsing
        fails; ``"0"`` is returned when the reply parses but carries a value
        outside the allowed set.
    """
    fallback = "90_counterclockwise"
    allowed = ("0", "180", "90_clockwise", "90_counterclockwise")

    api_key = os.environ.get("GEMINI_API_KEY")
    if not api_key:
        print("[WARN] GEMINI_API_KEY not set. Defaulting to 90_counterclockwise", flush=True)
        return fallback

    # Declare the real container format instead of always claiming JPEG:
    # callers also pass PNG and WebP files.  Unknown signatures keep the
    # historical "image/jpeg" label.
    if image_bytes[:8] == b"\x89PNG\r\n\x1a\n":
        mime_type = "image/png"
    elif image_bytes[:4] == b"RIFF" and image_bytes[8:12] == b"WEBP":
        mime_type = "image/webp"
    else:
        mime_type = "image/jpeg"

    client = genai.Client(api_key=api_key)
    model = "gemini-3.1-flash-lite-preview"

    # NOTE(review): the canned model turn below sits between the image and the
    # question, so the request reads as if "0" had already been answered for
    # this very image - confirm this ordering is intentional.
    contents = [
        types.Content(
            role="user",
            parts=[
                types.Part.from_bytes(mime_type=mime_type, data=image_bytes),
            ],
        ),
        types.Content(
            role="model",
            parts=[
                types.Part.from_text(text="""```json\n{"rotation": "0"}\n```"""),
            ],
        ),
        types.Content(
            role="user",
            parts=[
                types.Part.from_text(text="""Determine the rotation needed to make this image readable."""),
            ],
        ),
    ]

    generate_content_config = types.GenerateContentConfig(
        system_instruction=[
            types.Part.from_text(text='''you are the AI which detects which orientation the image should be rotated such that the text becomes readable.
output strict json:
{"rotation": "90_counterclockwise", "90_clockwise", "180", "0"}'''),
        ],
        temperature=0.0
    )

    # Best-effort boundary: any failure while calling the service or parsing
    # its reply is logged and mapped to the fallback rotation.
    try:
        response = client.models.generate_content(
            model=model,
            contents=contents,
            config=generate_content_config,
        )
        text = response.text
        if not text:
            raise ValueError("empty response text")

        # Strip an optional Markdown code fence around the JSON payload.
        if "```json" in text:
            text = text.split("```json")[1].split("```")[0].strip()
        elif "```" in text:
            text = text.split("```")[1].split("```")[0].strip()

        data = json.loads(text)
        if not isinstance(data, dict):
            raise ValueError(f"expected a JSON object, got {type(data).__name__}")

        # Normalise so numeric values (e.g. 180) or odd casing still match the
        # string constants the callers compare against.
        rotation = str(data.get("rotation", "0")).strip().lower()
    except Exception as e:
        print(f"[ERROR] Gemini rotation detection failed: {e}", flush=True)
        return fallback

    if rotation not in allowed:
        # Callers treat any unknown value as "no rotation"; make that explicit.
        print(f"[WARN] Unexpected rotation value from Gemini: {rotation!r}. Using 0", flush=True)
        return "0"
    return rotation

def process_image(input_path: str):
    """Rotate, crop and save one image file, together with its debug images.

    Steps: read the file, ask Gemini for the rotation and apply it, detect the
    page outline on a copy resized so its longer side is 800 pixels, expand the
    outline slightly, map it back to full resolution and perspective-warp it.
    If no outline is found the full rotated image is kept.  Outputs are written
    next to this script as ``<stem>_debug.jpg``, ``<stem>_binary_debug.jpg``
    and ``<stem>_cropped.jpg``.

    Args:
        input_path: path of the image file to process.

    Returns:
        None.  An unreadable input is reported on stdout and skipped.
    """
    script_dir = os.path.dirname(os.path.abspath(__file__))
    image = cv2.imread(input_path)
    if image is None:
        print(f"[ERROR] Cannot read: {input_path}")
        return

    # Gemini receives the raw encoded file, not the decoded pixel array.
    with open(input_path, "rb") as f:
        image_bytes = f.read()

    rotation_str = get_rotation_from_gemini(image_bytes)
    print(f"[INFO] Gemini detected rotation: {rotation_str}", flush=True)

    if rotation_str == "90_counterclockwise":
        rotated = cv2.rotate(image, cv2.ROTATE_90_COUNTERCLOCKWISE)
    elif rotation_str == "90_clockwise":
        rotated = cv2.rotate(image, cv2.ROTATE_90_CLOCKWISE)
    elif rotation_str == "180":
        rotated = cv2.rotate(image, cv2.ROTATE_180)
    else:
        rotated = image

    orig_h, orig_w = rotated.shape[:2]

    max_dim = 800.0
    ratio = max(orig_h, orig_w) / max_dim
    # Clamp to 1 pixel: for extreme aspect ratios the shorter side would
    # otherwise truncate to 0 and cv2.resize would fail.
    work_w = max(1, int(orig_w / ratio))
    work_h = max(1, int(orig_h / ratio))
    work_img = cv2.resize(rotated, (work_w, work_h))

    corners, all_quads, is_fallback = find_notebook_contour(work_img)
    stem = Path(input_path).stem
    debug_path = os.path.join(script_dir, f"{stem}_debug.jpg")

    if corners is not None:
        corners_exp = expand_quad(corners, work_img.shape, margin_frac=0.025)

        # Map the corners from working resolution back to full resolution.
        scale_x = orig_w / work_w
        scale_y = orig_h / work_h
        corners_orig = corners_exp.copy()
        corners_orig[:, 0] *= scale_x
        corners_orig[:, 1] *= scale_y
        corners_orig[:, 0] = np.clip(corners_orig[:, 0], 0, orig_w - 1)
        corners_orig[:, 1] = np.clip(corners_orig[:, 1], 0, orig_h - 1)

        cropped = four_point_transform(rotated, corners_orig)
        print("[INFO] Success! Applied crop.")
    else:
        print("[WARN] Total failure. Returning full rotated image.")
        cropped = rotated

    debug_img = draw_debug_image(work_img, corners, all_quads, is_fallback)
    save_binary_debug(work_img, debug_path)
    if not cv2.imwrite(debug_path, debug_img, [cv2.IMWRITE_JPEG_QUALITY, 90]):
        print(f"[WARN] Could not write debug image: {debug_path}")

    out_path = os.path.join(script_dir, f"{stem}_cropped.jpg")
    # cv2.imwrite reports failure through its return value; do not claim
    # success when nothing was written.
    if cv2.imwrite(out_path, cropped, [cv2.IMWRITE_JPEG_QUALITY, 95]):
        print(f"[INFO] Saved cropped: {out_path}")
    else:
        print(f"[ERROR] Could not write cropped image: {out_path}")


if __name__ == "__main__":
    # Command-line entry point: process the paths given as arguments, or -
    # when none are given - every source image sitting next to this script.
    cli_paths = sys.argv[1:]
    if cli_paths:
        for given in cli_paths:
            print(f"\nProcessing: {given}")
            process_image(given)
    else:
        here = os.path.dirname(os.path.abspath(__file__))
        image_exts = (".jpg", ".jpeg", ".png", ".bmp", ".webp")
        # Name fragments that mark files this tool produced itself.
        output_tags = ("_cropped", "_debug", "_binary_debug")
        pending = sorted(
            entry for entry in os.listdir(here)
            if entry.lower().endswith(image_exts)
            and all(tag not in entry for tag in output_tags)
        )
        if not pending:
            print("Place images next to script or provide paths.")
            sys.exit(1)
        for entry in pending:
            print(f"\nProcessing: {entry}")
            process_image(os.path.join(here, entry))


def auto_crop_process(image_bytes: bytes) -> bytes:
    """In-memory variant of the crop pipeline: encoded image in, JPEG bytes out.

    1. Decode the image bytes.
    2. Apply the rotation reported by ``get_rotation_from_gemini``.
    3. Detect the page outline on a copy whose longer side is 800 pixels.
    4. Perspective-warp the (slightly expanded) outline at full resolution,
       or keep the whole rotated image when no outline is found.
    5. Encode the result as JPEG (quality 95).

    Args:
        image_bytes: encoded image file contents.

    Returns:
        JPEG bytes of the processed image.  The input is returned unchanged
        when it is empty, cannot be decoded, or the result cannot be encoded.
    """
    # Nothing to decode; avoid handing an empty buffer to cv2.imdecode.
    if not image_bytes:
        return image_bytes

    nparr = np.frombuffer(image_bytes, np.uint8)
    image = cv2.imdecode(nparr, cv2.IMREAD_COLOR)
    if image is None:
        return image_bytes

    # 1. Rotate
    rotation_str = get_rotation_from_gemini(image_bytes)
    print(f"[PROCESS] Gemini detected rotation: {rotation_str}", flush=True)

    if rotation_str == "90_counterclockwise":
        rotated = cv2.rotate(image, cv2.ROTATE_90_COUNTERCLOCKWISE)
    elif rotation_str == "90_clockwise":
        rotated = cv2.rotate(image, cv2.ROTATE_90_CLOCKWISE)
    elif rotation_str == "180":
        rotated = cv2.rotate(image, cv2.ROTATE_180)
    else:
        rotated = image
    orig_h, orig_w = rotated.shape[:2]

    # 2. Resize for detection
    max_dim = 800.0
    ratio = max(orig_h, orig_w) / max_dim
    # Clamp to 1 pixel: for extreme aspect ratios the shorter side would
    # otherwise truncate to 0 and cv2.resize would fail.
    work_w = max(1, int(orig_w / ratio))
    work_h = max(1, int(orig_h / ratio))
    work_img = cv2.resize(rotated, (work_w, work_h))

    # 3. Find contour (only the chosen quad is needed here)
    corners, _all_quads, _is_fallback = find_notebook_contour(work_img)

    # 4. Transform
    if corners is not None:
        corners_exp = expand_quad(corners, work_img.shape, margin_frac=0.025)

        # Map the corners from working resolution back to full resolution.
        scale_x = orig_w / work_w
        scale_y = orig_h / work_h
        corners_orig = corners_exp.copy()
        corners_orig[:, 0] *= scale_x
        corners_orig[:, 1] *= scale_y
        corners_orig[:, 0] = np.clip(corners_orig[:, 0], 0, orig_w - 1)
        corners_orig[:, 1] = np.clip(corners_orig[:, 1], 0, orig_h - 1)

        cropped = four_point_transform(rotated, corners_orig)
    else:
        cropped = rotated

    # 5. Encode back to bytes
    ok, result_bytes = cv2.imencode('.jpg', cropped, [cv2.IMWRITE_JPEG_QUALITY, 95])
    if not ok:
        # Mirror the decode-failure path rather than returning an empty payload.
        print("[PROCESS] JPEG encoding failed. Returning original bytes.", flush=True)
        return image_bytes
    return result_bytes.tobytes()