backend / processor.py
SolarumAsteridion's picture
Fix ThinkingConfig validation error
617884b
#!/usr/bin/env python3
"""
Notebook Auto-Crop Tool v5 — Tight-Crop Fix
"""
import cv2
import numpy as np
import sys
import os
import json
from pathlib import Path
from google import genai
from google.genai import types
def order_points(pts):
    """Order four corner points as top-left, top-right, bottom-right, bottom-left.

    The primary rule is the coordinate sum/difference heuristic: the
    top-left corner has the smallest ``x + y``, the bottom-right the largest,
    the top-right the smallest ``y - x`` and the bottom-left the largest.

    For some shapes (for example a diamond, or a rectangle rotated by about
    45 degrees) that heuristic selects the same input point for two different
    corners, which drops a real corner from the result.  When that happens
    for a four-point input, the points are instead sorted clockwise around
    their centroid, starting from the point with the smallest ``x + y``.

    Args:
        pts: array-like of shape (4, 2) holding ``(x, y)`` points.

    Returns:
        A ``float32`` array of shape (4, 2) in TL, TR, BR, BL order.
    """
    pts = np.asarray(pts)
    coord_sum = pts.sum(axis=1)
    coord_diff = np.diff(pts, axis=1).ravel()  # y - x for every point
    picks = [
        int(np.argmin(coord_sum)),   # top-left
        int(np.argmin(coord_diff)),  # top-right
        int(np.argmax(coord_sum)),   # bottom-right
        int(np.argmax(coord_diff)),  # bottom-left
    ]
    rect = pts[picks].astype("float32")
    # Keep the classic result whenever it is a true permutation of the input
    # (or the input is not a four-point set, where no better rule applies).
    if len(pts) != 4 or len(set(picks)) == 4:
        return rect

    # Fallback: with the image y axis pointing down, ascending atan2 angles
    # walk around the centroid clockwise, which matches TL -> TR -> BR -> BL.
    center = pts.mean(axis=0)
    angles = np.arctan2(pts[:, 1] - center[1], pts[:, 0] - center[0])
    clockwise = pts[np.argsort(angles, kind="stable")]
    start = int(np.argmin(clockwise.sum(axis=1)))
    return np.roll(clockwise, -start, axis=0).astype("float32")
def four_point_transform(image, pts):
    """Warp the quadrilateral described by *pts* into an upright rectangle.

    The corners are first put in TL, TR, BR, BL order with ``order_points``.
    The output width and height are the longer of the two opposing edge
    lengths (truncated to int, never less than 1 pixel).

    Args:
        image: source image to sample from.
        pts: four ``(x, y)`` corner points in *image* coordinates.

    Returns:
        The perspective-corrected crop produced by ``cv2.warpPerspective``.
    """
    src = order_points(pts)
    top_left, top_right, bottom_right, bottom_left = src

    longest_w = max(np.linalg.norm(bottom_right - bottom_left),
                    np.linalg.norm(top_right - top_left))
    longest_h = max(np.linalg.norm(top_right - bottom_right),
                    np.linalg.norm(top_left - bottom_left))
    out_w = max(int(longest_w), 1)
    out_h = max(int(longest_h), 1)

    # Destination corners in the same TL, TR, BR, BL order as `src`.
    target = np.array(
        [[0, 0], [out_w - 1, 0], [out_w - 1, out_h - 1], [0, out_h - 1]],
        dtype="float32",
    )
    matrix = cv2.getPerspectiveTransform(src, target)
    return cv2.warpPerspective(image, matrix, (out_w, out_h))
def is_valid_quad(quad, img_shape):
    """Decide whether *quad* is a plausible page outline.

    A quad is rejected when any corner is degenerate (an adjacent edge of
    near-zero length), when any interior angle is below 30 or above 150
    degrees, when the shorter average side is under 1 pixel, or when the
    average sides differ by more than a factor of 5.

    Args:
        quad: array of four ``(x, y)`` points.
        img_shape: accepted for interface compatibility; not used here.

    Returns:
        Truthy when the quad passes every check, falsy otherwise.
    """
    corners = order_points(quad.astype(np.float32))

    for idx, vertex in enumerate(corners):
        to_prev = corners[(idx - 1) % 4] - vertex
        to_next = corners[(idx + 1) % 4] - vertex
        scale = np.linalg.norm(to_prev) * np.linalg.norm(to_next)
        if scale < 1e-6:
            return False
        # Clip guards arccos against rounding just outside [-1, 1].
        cosine = np.clip(np.dot(to_prev, to_next) / scale, -1, 1)
        interior = np.degrees(np.arccos(cosine))
        if interior < 30 or interior > 150:
            return False

    top = np.linalg.norm(corners[1] - corners[0])
    bottom = np.linalg.norm(corners[2] - corners[3])
    left = np.linalg.norm(corners[3] - corners[0])
    right = np.linalg.norm(corners[2] - corners[1])
    mean_w = (top + bottom) / 2
    mean_h = (left + right) / 2

    if min(mean_w, mean_h) < 1:
        return False
    return max(mean_w, mean_h) / min(mean_w, mean_h) <= 5.0
def expand_quad(quad, img_shape, margin_frac=0.025):
    """Push every corner of *quad* away from the centroid, then clamp to the image.

    Args:
        quad: array of ``(x, y)`` corner points.
        img_shape: shape of the image the quad lives in (``(h, w, ...)``).
        margin_frac: fraction of the centroid-to-corner vector to add.

    Returns:
        A new ``float32`` array with the enlarged corners, clipped to
        ``[0, w - 1]`` in x and ``[0, h - 1]`` in y.  *quad* is not modified.
    """
    centroid = quad.mean(axis=0)
    grown = (quad + (quad - centroid) * margin_frac).astype(np.float32)

    height, width = img_shape[:2]
    np.clip(grown[:, 0], 0, width - 1, out=grown[:, 0])
    np.clip(grown[:, 1], 0, height - 1, out=grown[:, 1])
    return grown
def get_binary_strategies(work_img):
    """Build several binary masks of *work_img* for contour detection.

    Four masks are produced, in this order:

    * ``"Otsu"``      - Otsu threshold of the Gaussian-blurred grayscale image.
    * ``"HSV-V"``     - Otsu threshold of the blurred HSV value channel.
    * ``"Bilateral"`` - Otsu threshold after bilateral + Gaussian filtering.
    * ``"FloodFill"`` - everything not reachable from the image border
      through the gaps of a dilated Canny edge map.

    Args:
        work_img: BGR image.

    Returns:
        A list of ``(name, mask)`` tuples.
    """
    gray = cv2.cvtColor(work_img, cv2.COLOR_BGR2GRAY)
    rows, cols = gray.shape
    close_kernel = np.ones((15, 15), np.uint8)
    open_kernel = np.ones((5, 5), np.uint8)

    def otsu_mask(channel):
        # Otsu threshold, then close (3x) and open (1x) to tidy the mask.
        _, mask = cv2.threshold(channel, 0, 255,
                                cv2.THRESH_BINARY + cv2.THRESH_OTSU)
        mask = cv2.morphologyEx(mask, cv2.MORPH_CLOSE, close_kernel,
                                iterations=3)
        return cv2.morphologyEx(mask, cv2.MORPH_OPEN, open_kernel,
                                iterations=1)

    results = [("Otsu", otsu_mask(cv2.GaussianBlur(gray, (15, 15), 0)))]

    value_channel = cv2.cvtColor(work_img, cv2.COLOR_BGR2HSV)[:, :, 2]
    results.append(
        ("HSV-V", otsu_mask(cv2.GaussianBlur(value_channel, (15, 15), 0))))

    smoothed = cv2.GaussianBlur(cv2.bilateralFilter(gray, 9, 75, 75),
                                (11, 11), 0)
    results.append(("Bilateral", otsu_mask(smoothed)))

    # Edge-based strategy: thicken the Canny edges, then flood the outside.
    edge_map = cv2.Canny(cv2.GaussianBlur(gray, (9, 9), 0), 25, 80)
    edge_map = cv2.dilate(edge_map, np.ones((7, 7), np.uint8), iterations=3)
    edge_map = cv2.morphologyEx(edge_map, cv2.MORPH_CLOSE,
                                np.ones((13, 13), np.uint8), iterations=2)

    filled = edge_map.copy()
    fill_mask = np.zeros((rows + 2, cols + 2), np.uint8)
    stride = max(1, min(cols, rows) // 20)

    # Seed points along the border: top/bottom pairs first, then left/right.
    seeds = [pt for x in range(0, cols, stride)
             for pt in ((x, 0), (x, rows - 1))]
    seeds += [pt for y in range(0, rows, stride)
              for pt in ((0, y), (cols - 1, y))]
    for seed_x, seed_y in seeds:
        if filled[seed_y, seed_x] == 0:
            cv2.floodFill(filled, fill_mask, (seed_x, seed_y), 128)

    # Pixels marked 128 were flooded from the border; the rest is foreground.
    page = np.where(filled == 128, 0, 255).astype(np.uint8)
    page = cv2.morphologyEx(page, cv2.MORPH_CLOSE, close_kernel, iterations=2)
    results.append(("FloodFill", page))
    return results
# Search every binary mask returned by get_binary_strategies() for the best
# four-corner outline in work_img.
#
# Returns a tuple (best_quad, all_quads, is_fallback):
#   best_quad   - float32 array of corner points, or None if nothing qualified
#   all_quads   - every candidate quad appended during the search
#   is_fallback - True only when best_quad came from the last-resort branch
#                 at the end of the function
#
# NOTE(review): indentation has been lost in this copy of the file, so the
# exact nesting of the `break` statements and of the best_area updates cannot
# be read off the text. Confirm against the properly indented original before
# relying on the per-line notes below.
def find_notebook_contour(work_img):
strategies = get_binary_strategies(work_img)
# Total pixel count; the area thresholds below are fractions of this.
img_area = work_img.shape[0] * work_img.shape[1]
best_quad = None
best_area = 0
all_quads = []
is_fallback = False
# Largest contour seen in any strategy, kept for the final fallback.
max_cnt = None
max_cnt_area = 0
for name, binary in strategies:
contours, _ = cv2.findContours(binary, cv2.RETR_EXTERNAL,
cv2.CHAIN_APPROX_SIMPLE)
# Only the five largest contours of each mask are examined.
contours = sorted(contours, key=cv2.contourArea, reverse=True)[:5]
for cnt in contours:
area = cv2.contourArea(cnt)
if area > max_cnt_area:
max_cnt_area = area
max_cnt = cnt
# Contours covering less than 15% of the image are not considered further.
if area < 0.15 * img_area:
continue
# Pass 1: approximate the raw contour with tolerances from 1% to 10% of its
# perimeter, looking for a four-vertex polygon that passes is_valid_quad().
peri = cv2.arcLength(cnt, True)
for eps in np.linspace(0.01, 0.1, 20):
approx = cv2.approxPolyDP(cnt, eps * peri, True)
if len(approx) == 4:
q = approx.reshape(4, 2).astype(np.float32)
if is_valid_quad(q, work_img.shape):
all_quads.append(q)
if area > best_area:
best_area = area
best_quad = q
break
# Stop once the approximation has fewer than four vertices.
elif len(approx) < 4:
break
# Pass 2: the same search, run on the convex hull of the contour.
hull = cv2.convexHull(cnt)
peri_h = cv2.arcLength(hull, True)
for eps in np.linspace(0.01, 0.1, 20):
approx = cv2.approxPolyDP(hull, eps * peri_h, True)
if len(approx) == 4:
q = approx.reshape(4, 2).astype(np.float32)
if is_valid_quad(q, work_img.shape):
all_quads.append(q)
if area > best_area:
best_area = area
best_quad = q
break
elif len(approx) < 4:
break
# Pass 3: for contours above 20% of the image, also try the minimum-area
# rotated rectangle. Its score is the contour area scaled by 0.90, so it must
# beat the current best by that margin to replace it.
if area > 0.20 * img_area:
box = cv2.boxPoints(cv2.minAreaRect(cnt)).astype(np.float32)
if is_valid_quad(box, work_img.shape):
all_quads.append(box)
if area * 0.90 > best_area:
best_area = area * 0.90
best_quad = box
# Last resort: no quad was accepted, but the largest contour covers more than
# 10% of the image - use its rotated bounding rectangle without calling
# is_valid_quad() and flag the result as a fallback.
if best_quad is None and max_cnt is not None \
and max_cnt_area > 0.10 * img_area:
box = cv2.boxPoints(cv2.minAreaRect(max_cnt)).astype(np.float32)
best_quad = box
all_quads.append(box)
is_fallback = True
return best_quad, all_quads, is_fallback
def draw_debug_image(work_img, corners, all_quads, is_fallback):
    """Render a diagnostic overlay of the detection result.

    Every candidate quad is drawn as a thin outline.  When *corners* is
    given, it is drawn as a thick outline (one colour for a fallback result,
    another for a regular detection) with its four ordered corners marked and
    labelled.  A status banner is painted across the top of the image.

    Args:
        work_img: BGR image the detection ran on; it is copied, not modified.
        corners: selected quad, or ``None`` when nothing was detected.
        all_quads: iterable of candidate quads.
        is_fallback: whether *corners* came from the fallback path.

    Returns:
        The annotated copy of *work_img*.
    """
    canvas = work_img.copy()
    width = canvas.shape[1]

    for candidate in all_quads:
        cv2.polylines(canvas, [candidate.astype(np.int32)], True,
                      (0, 255, 255), 1)

    detected = corners is not None
    if detected:
        outline = (0, 165, 255) if is_fallback else (0, 255, 0)
        cv2.polylines(canvas, [corners.astype(np.int32)], True, outline, 3)

        corner_tags = ["TL", "TR", "BR", "BL"]
        corner_colours = [(255, 0, 0), (0, 0, 255), (255, 0, 255), (0, 255, 0)]
        for point, tag, colour in zip(order_points(corners), corner_tags,
                                      corner_colours):
            px, py = int(point[0]), int(point[1])
            cv2.circle(canvas, (px, py), 8, colour, -1)
            cv2.putText(canvas, tag, (px + 10, py + 5),
                        cv2.FONT_HERSHEY_SIMPLEX, 0.6, colour, 2)

    # Solid black strip behind the status text.
    cv2.rectangle(canvas, (0, 0), (width, 40), (0, 0, 0), -1)

    if not detected:
        status, status_colour = "NOTHING DETECTED", (0, 0, 255)
    elif is_fallback:
        status, status_colour = "FALLBACK", (0, 165, 255)
    else:
        status, status_colour = "QUAD DETECTED (green outline)", (0, 255, 0)
    cv2.putText(canvas, status, (10, 25), cv2.FONT_HERSHEY_SIMPLEX, 0.7,
                status_colour, 2)
    return canvas
def save_binary_debug(work_img, debug_path):
    """Write a side-by-side strip of all binary strategy masks to disk.

    Each mask from ``get_binary_strategies`` is resized to 300 px wide,
    labelled with its strategy name, padded with black rows to a common
    height and joined horizontally.  The strip is saved as a JPEG whose path
    is *debug_path* with ``"_debug."`` replaced by ``"_binary_debug."``.

    Args:
        work_img: BGR image the strategies are computed from.
        debug_path: path of the regular debug image.
    """
    target_width = 300
    tiles = []
    for label, mask in get_binary_strategies(work_img):
        factor = target_width / mask.shape[1]
        shrunk = cv2.resize(mask, (target_width, int(mask.shape[0] * factor)))
        tile = cv2.cvtColor(shrunk, cv2.COLOR_GRAY2BGR)
        cv2.putText(tile, label, (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 0.7,
                    (0, 255, 0), 2)
        tiles.append(tile)

    tallest = max(tile.shape[0] for tile in tiles)
    # Pad shorter tiles at the bottom so hstack sees equal heights.
    row = [
        tile if tile.shape[0] >= tallest else np.vstack(
            [tile,
             np.zeros((tallest - tile.shape[0], tile.shape[1], 3), np.uint8)])
        for tile in tiles
    ]

    strip_path = debug_path.replace("_debug.", "_binary_debug.")
    cv2.imwrite(strip_path, np.hstack(row), [cv2.IMWRITE_JPEG_QUALITY, 85])
def get_rotation_from_gemini(image_bytes: bytes) -> str:
    """Ask Gemini which rotation makes the text in the image readable.

    The encoded image is sent with a MIME type sniffed from its leading
    bytes (PNG and WEBP are recognised; anything else is sent as JPEG).

    Args:
        image_bytes: the encoded image file contents.

    Returns:
        One of ``"90_counterclockwise"``, ``"90_clockwise"``, ``"180"`` or
        ``"0"``.  ``"90_counterclockwise"`` is returned when
        ``GEMINI_API_KEY`` is unset or the request/parsing fails; ``"0"`` is
        returned when the reply is valid JSON but carries an unknown label.
    """
    api_key = os.environ.get("GEMINI_API_KEY")
    if not api_key:
        print("[WARN] GEMINI_API_KEY not set. Defaulting to 90_counterclockwise", flush=True)
        return "90_counterclockwise"
    client = genai.Client(api_key=api_key)
    model = "gemini-3.1-flash-lite-preview"
    contents = [
        types.Content(
            role="user",
            parts=[
                # Label the payload with its real format instead of always
                # claiming JPEG; unknown formats still fall back to JPEG.
                types.Part.from_bytes(
                    mime_type=_guess_image_mime(image_bytes),
                    data=image_bytes,
                ),
            ],
        ),
        types.Content(
            role="model",
            parts=[
                types.Part.from_text(text="""```json\n{"rotation": "0"}\n```"""),
            ],
        ),
        types.Content(
            role="user",
            parts=[
                types.Part.from_text(text="""Determine the rotation needed to make this image readable."""),
            ],
        ),
    ]
    generate_content_config = types.GenerateContentConfig(
        system_instruction=[
            types.Part.from_text(text='''you are the AI which detects which orientation the image should be rotated such that the text becomes readable.
output strict json:
{"rotation": "90_counterclockwise", "90_clockwise", "180", "0"}'''),
        ],
        temperature=0.0
    )
    try:
        response = client.models.generate_content(
            model=model,
            contents=contents,
            config=generate_content_config,
        )
        return _parse_rotation_reply(response.text)
    except Exception as e:
        # Best-effort boundary: any API or parsing failure falls back to the
        # legacy default rotation instead of aborting the crop.
        print(f"[ERROR] Gemini rotation detection failed: {e}", flush=True)
        return "90_counterclockwise"


def _guess_image_mime(image_bytes):
    """Return the MIME type implied by the magic bytes (JPEG when unknown)."""
    if image_bytes[:8] == b"\x89PNG\r\n\x1a\n":
        return "image/png"
    if image_bytes[:4] == b"RIFF" and image_bytes[8:12] == b"WEBP":
        return "image/webp"
    return "image/jpeg"


def _parse_rotation_reply(text):
    """Extract the rotation label from the model reply and normalise it."""
    # The reply may be wrapped in a Markdown code fence; unwrap it first.
    if "```json" in text:
        text = text.split("```json")[1].split("```")[0].strip()
    elif "```" in text:
        text = text.split("```")[1].strip()
    data = json.loads(text)
    # str() lets a numeric answer such as 180 match the "180" label.
    rotation = str(data.get("rotation", "0")).strip()
    if rotation not in ("90_counterclockwise", "90_clockwise", "180", "0"):
        print(f"[WARN] Unexpected rotation value {rotation!r}; using 0", flush=True)
        return "0"
    return rotation
def process_image(input_path: str):
    """Rotate, detect and crop the notebook page in the image at *input_path*.

    The image is rotated as suggested by ``get_rotation_from_gemini``, page
    detection runs on a copy resized so its longer side is about 800 px, and
    the detected quad (slightly expanded) is perspective-cropped from the
    full-resolution rotated image.  When no quad is found the whole rotated
    image is kept.

    Three JPEG files are written next to this script:
    ``<stem>_debug.jpg``, ``<stem>_binary_debug.jpg`` and
    ``<stem>_cropped.jpg``.

    Args:
        input_path: path of the image file to process.

    Returns:
        None.  Progress and failures are reported on stdout.
    """
    script_dir = os.path.dirname(os.path.abspath(__file__))
    image = cv2.imread(input_path)
    if image is None:
        print(f"[ERROR] Cannot read: {input_path}")
        return
    with open(input_path, "rb") as f:
        image_bytes = f.read()
    rotation_str = get_rotation_from_gemini(image_bytes)
    print(f"[INFO] Gemini detected rotation: {rotation_str}", flush=True)
    if rotation_str == "90_counterclockwise":
        rotated = cv2.rotate(image, cv2.ROTATE_90_COUNTERCLOCKWISE)
    elif rotation_str == "90_clockwise":
        rotated = cv2.rotate(image, cv2.ROTATE_90_CLOCKWISE)
    elif rotation_str == "180":
        rotated = cv2.rotate(image, cv2.ROTATE_180)
    else:
        rotated = image

    orig_h, orig_w = rotated.shape[:2]
    max_dim = 800.0
    ratio = max(orig_h, orig_w) / max_dim
    # Very elongated images would otherwise truncate to a zero-sized side,
    # which cv2.resize rejects.
    work_w = max(1, int(orig_w / ratio))
    work_h = max(1, int(orig_h / ratio))
    work_img = cv2.resize(rotated, (work_w, work_h))

    corners, all_quads, is_fallback = find_notebook_contour(work_img)
    stem = Path(input_path).stem
    debug_path = os.path.join(script_dir, f"{stem}_debug.jpg")
    if corners is not None:
        corners_exp = expand_quad(corners, work_img.shape, margin_frac=0.025)
        # Map the corners from the working image back to full resolution.
        scale_x = orig_w / work_w
        scale_y = orig_h / work_h
        corners_orig = corners_exp.copy()
        corners_orig[:, 0] *= scale_x
        corners_orig[:, 1] *= scale_y
        corners_orig[:, 0] = np.clip(corners_orig[:, 0], 0, orig_w - 1)
        corners_orig[:, 1] = np.clip(corners_orig[:, 1], 0, orig_h - 1)
        cropped = four_point_transform(rotated, corners_orig)
        print("[INFO] Success! Applied crop.")
    else:
        print("[WARN] Total failure. Returning full rotated image.")
        cropped = rotated

    debug_img = draw_debug_image(work_img, corners, all_quads, is_fallback)
    save_binary_debug(work_img, debug_path)
    # cv2.imwrite signals failure through its return value, so check it
    # instead of reporting success unconditionally.
    if not cv2.imwrite(debug_path, debug_img, [cv2.IMWRITE_JPEG_QUALITY, 90]):
        print(f"[WARN] Could not write debug image: {debug_path}")
    out_path = os.path.join(script_dir, f"{stem}_cropped.jpg")
    if cv2.imwrite(out_path, cropped, [cv2.IMWRITE_JPEG_QUALITY, 95]):
        print(f"[INFO] Saved cropped: {out_path}")
    else:
        print(f"[ERROR] Could not write cropped image: {out_path}")
if __name__ == "__main__":
    # Paths given on the command line win; otherwise every image that sits
    # next to this script (and is not one of our own outputs) is processed.
    cli_paths = sys.argv[1:]
    if cli_paths:
        for path_arg in cli_paths:
            print(f"\nProcessing: {path_arg}")
            process_image(path_arg)
    else:
        here = os.path.dirname(os.path.abspath(__file__))
        image_suffixes = (".jpg", ".jpeg", ".png", ".bmp", ".webp")
        generated_markers = ("_cropped", "_debug", "_binary_debug")

        candidates = []
        for entry in os.listdir(here):
            if not entry.lower().endswith(image_suffixes):
                continue
            # Skip files this tool produced on an earlier run.
            if any(marker in entry for marker in generated_markers):
                continue
            candidates.append(entry)

        if not candidates:
            print("Place images next to script or provide paths.")
            sys.exit(1)
        for entry in sorted(candidates):
            print(f"\nProcessing: {entry}")
            process_image(os.path.join(here, entry))
def auto_crop_process(image_bytes: bytes) -> bytes:
    """
    In-memory counterpart of process_image: crop the notebook page out of
    an encoded image.

    1. Decode the image bytes with OpenCV.
    2. Rotate as suggested by get_rotation_from_gemini().
    3. Detect the page on a copy resized so its longer side is about 800 px
       and perspective-crop the full-resolution rotated image (the whole
       rotated image is kept when no quad is found).
    4. Return the result encoded as JPEG (quality 95).

    Args:
        image_bytes: encoded image file contents.

    Returns:
        JPEG bytes of the processed image, or *image_bytes* unchanged when
        the input cannot be decoded or the result cannot be encoded.
    """
    nparr = np.frombuffer(image_bytes, np.uint8)
    image = cv2.imdecode(nparr, cv2.IMREAD_COLOR)
    if image is None:
        return image_bytes

    # 1. Rotate
    rotation_str = get_rotation_from_gemini(image_bytes)
    print(f"[PROCESS] Gemini detected rotation: {rotation_str}", flush=True)
    if rotation_str == "90_counterclockwise":
        rotated = cv2.rotate(image, cv2.ROTATE_90_COUNTERCLOCKWISE)
    elif rotation_str == "90_clockwise":
        rotated = cv2.rotate(image, cv2.ROTATE_90_CLOCKWISE)
    elif rotation_str == "180":
        rotated = cv2.rotate(image, cv2.ROTATE_180)
    else:
        rotated = image
    orig_h, orig_w = rotated.shape[:2]

    # 2. Resize for detection
    max_dim = 800.0
    ratio = max(orig_h, orig_w) / max_dim
    # Very elongated images would otherwise truncate to a zero-sized side,
    # which cv2.resize rejects.
    work_w = max(1, int(orig_w / ratio))
    work_h = max(1, int(orig_h / ratio))
    work_img = cv2.resize(rotated, (work_w, work_h))

    # 3. Find contour (candidate list and fallback flag are debug-only)
    corners, _, _ = find_notebook_contour(work_img)

    # 4. Transform
    if corners is not None:
        corners_exp = expand_quad(corners, work_img.shape, margin_frac=0.025)
        # Map the corners from the working image back to full resolution.
        scale_x = orig_w / work_w
        scale_y = orig_h / work_h
        corners_orig = corners_exp.copy()
        corners_orig[:, 0] *= scale_x
        corners_orig[:, 1] *= scale_y
        corners_orig[:, 0] = np.clip(corners_orig[:, 0], 0, orig_w - 1)
        corners_orig[:, 1] = np.clip(corners_orig[:, 1], 0, orig_h - 1)
        cropped = four_point_transform(rotated, corners_orig)
    else:
        cropped = rotated

    # 5. Encode back to bytes; hand back the input if encoding reports failure
    ok, result_bytes = cv2.imencode('.jpg', cropped, [cv2.IMWRITE_JPEG_QUALITY, 95])
    if not ok:
        return image_bytes
    return result_bytes.tobytes()