Spaces:

SolarumAsteridion
/

backend

Running

App Files Files Community

backend / processor.py

SolarumAsteridion

Fix ThinkingConfig validation error

617884b about 1 month ago

raw

history blame contribute delete

15.9 kB

	#!/usr/bin/env python3
	"""
	Notebook Auto-Crop Tool v5 — Tight-Crop Fix
	"""

	import cv2
	import numpy as np
	import sys
	import os
	import json
	from pathlib import Path
	from google import genai
	from google.genai import types


	def order_points(pts):
	    """Arrange four corner points as top-left, top-right, bottom-right, bottom-left.

	    The point with the smallest x + y becomes the first row and the one with
	    the largest x + y the third row; the smallest and largest y - x select
	    the second and fourth rows respectively.

	    Args:
	        pts: NumPy array of shape (4, 2) holding (x, y) coordinates.

	    Returns:
	        A (4, 2) float32 array ordered [TL, TR, BR, BL].
	    """
	    coord_sum = pts.sum(axis=1)
	    # np.diff along axis 1 yields y - x for each point.
	    coord_diff = np.diff(pts, axis=1).ravel()

	    top_left = pts[np.argmin(coord_sum)]
	    bottom_right = pts[np.argmax(coord_sum)]
	    top_right = pts[np.argmin(coord_diff)]
	    bottom_left = pts[np.argmax(coord_diff)]

	    return np.array(
	        [top_left, top_right, bottom_right, bottom_left],
	        dtype="float32",
	    )


	def four_point_transform(image, pts):
	    """Warp the quadrilateral described by ``pts`` onto an upright rectangle.

	    Args:
	        image: Source image passed to ``cv2.warpPerspective``.
	        pts: Four (x, y) corner points in any order; they are sorted with
	            ``order_points`` before use.

	    Returns:
	        The perspective-corrected image. Its width and height are the longer
	        of the two opposing edge lengths, truncated to int and never below 1.
	    """
	    corners = order_points(pts)
	    top_left, top_right, bottom_right, bottom_left = corners

	    bottom_edge = np.linalg.norm(bottom_right - bottom_left)
	    top_edge = np.linalg.norm(top_right - top_left)
	    right_edge = np.linalg.norm(top_right - bottom_right)
	    left_edge = np.linalg.norm(top_left - bottom_left)

	    # Clamp to 1 so a degenerate quad never requests an empty output image.
	    out_w = max(int(max(bottom_edge, top_edge)), 1)
	    out_h = max(int(max(right_edge, left_edge)), 1)

	    target = np.array(
	        [
	            [0, 0],
	            [out_w - 1, 0],
	            [out_w - 1, out_h - 1],
	            [0, out_h - 1],
	        ],
	        dtype="float32",
	    )
	    matrix = cv2.getPerspectiveTransform(corners, target)
	    return cv2.warpPerspective(image, matrix, (out_w, out_h))


	def is_valid_quad(quad, img_shape):
	    """Decide whether a four-point polygon is a plausible page outline.

	    A quad is rejected when any corner has a (near) zero-length edge, when
	    any interior angle is below 30 or above 150 degrees, when the shorter
	    average side is under one pixel, or when the aspect ratio of the
	    averaged sides exceeds 5:1.

	    Args:
	        quad: Array of four (x, y) points; converted to float32 and ordered
	            with ``order_points``.
	        img_shape: Accepted for interface compatibility; not used here.

	    Returns:
	        True-like when the quad passes every check, otherwise False.
	    """
	    pts = order_points(quad.astype(np.float32))

	    for idx, corner in enumerate(pts):
	        to_prev = pts[(idx - 1) % 4] - corner
	        to_next = pts[(idx + 1) % 4] - corner
	        norm_product = np.linalg.norm(to_prev) * np.linalg.norm(to_next)
	        if norm_product < 1e-6:
	            return False
	        # Clip guards arccos against rounding just outside [-1, 1].
	        cosine = np.clip(np.dot(to_prev, to_next) / norm_product, -1, 1)
	        angle = np.degrees(np.arccos(cosine))
	        if angle < 30 or angle > 150:
	            return False

	    top_len = np.linalg.norm(pts[1] - pts[0])
	    bottom_len = np.linalg.norm(pts[2] - pts[3])
	    left_len = np.linalg.norm(pts[3] - pts[0])
	    right_len = np.linalg.norm(pts[2] - pts[1])

	    avg_w = (top_len + bottom_len) / 2
	    avg_h = (left_len + right_len) / 2
	    short_side = min(avg_w, avg_h)
	    if short_side < 1:
	        return False
	    long_side = max(avg_w, avg_h)
	    return long_side / short_side <= 5.0


	def expand_quad(quad, img_shape, margin_frac=0.025):
	    """Push every vertex of ``quad`` away from the centroid, then clamp to the image.

	    Args:
	        quad: Array of (x, y) vertices.
	        img_shape: Image shape; only the first two entries (height, width)
	            are read.
	        margin_frac: Fraction of each centroid-to-vertex vector that is added
	            to the vertex.

	    Returns:
	        A new float32 array of the moved vertices with x limited to
	        [0, width - 1] and y limited to [0, height - 1]. ``quad`` itself is
	        not modified.
	    """
	    centroid = quad.mean(axis=0)
	    grown = (quad + (quad - centroid) * margin_frac).astype(np.float32)

	    img_h, img_w = img_shape[:2]
	    np.clip(grown[:, 0], 0, img_w - 1, out=grown[:, 0])
	    np.clip(grown[:, 1], 0, img_h - 1, out=grown[:, 1])
	    return grown


	def get_binary_strategies(work_img):
	    """Produce several candidate binary masks of the page from one image.

	    Four independent segmentations are computed:

	    * "Otsu": Otsu threshold of the Gaussian-blurred grayscale image.
	    * "HSV-V": Otsu threshold of the blurred HSV value channel.
	    * "Bilateral": Otsu threshold after bilateral + Gaussian smoothing.
	    * "FloodFill": Canny edges are thickened and closed, the region reachable
	      from the image border is flood-filled, and everything not reached is
	      kept as foreground.

	    Args:
	        work_img: Image converted with ``COLOR_BGR2GRAY`` / ``COLOR_BGR2HSV``.

	    Returns:
	        List of ``(name, mask)`` tuples in the order listed above.
	    """
	    gray = cv2.cvtColor(work_img, cv2.COLOR_BGR2GRAY)
	    rows, cols = gray.shape
	    close_kernel = np.ones((15, 15), np.uint8)
	    open_kernel = np.ones((5, 5), np.uint8)

	    def otsu_mask(channel):
	        # Otsu threshold, then close gaps and remove small specks.
	        _, mask = cv2.threshold(
	            channel, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU
	        )
	        mask = cv2.morphologyEx(mask, cv2.MORPH_CLOSE, close_kernel, iterations=3)
	        return cv2.morphologyEx(mask, cv2.MORPH_OPEN, open_kernel, iterations=1)

	    results = []

	    gray_blurred = cv2.GaussianBlur(gray, (15, 15), 0)
	    results.append(("Otsu", otsu_mask(gray_blurred)))

	    hsv_img = cv2.cvtColor(work_img, cv2.COLOR_BGR2HSV)
	    value_blurred = cv2.GaussianBlur(hsv_img[:, :, 2], (15, 15), 0)
	    results.append(("HSV-V", otsu_mask(value_blurred)))

	    smoothed = cv2.bilateralFilter(gray, 9, 75, 75)
	    smoothed = cv2.GaussianBlur(smoothed, (11, 11), 0)
	    results.append(("Bilateral", otsu_mask(smoothed)))

	    edge_input = cv2.GaussianBlur(gray, (9, 9), 0)
	    edge_map = cv2.Canny(edge_input, 25, 80)
	    edge_map = cv2.dilate(edge_map, np.ones((7, 7), np.uint8), iterations=3)
	    edge_map = cv2.morphologyEx(
	        edge_map, cv2.MORPH_CLOSE, np.ones((13, 13), np.uint8), iterations=2
	    )

	    filled = edge_map.copy()
	    fill_mask = np.zeros((rows + 2, cols + 2), np.uint8)
	    stride = max(1, min(cols, rows) // 20)

	    # Seeds along the top/bottom rows first, then the left/right columns.
	    border_seeds = []
	    for x in range(0, cols, stride):
	        border_seeds.append((x, 0))
	        border_seeds.append((x, rows - 1))
	    for y in range(0, rows, stride):
	        border_seeds.append((0, y))
	        border_seeds.append((cols - 1, y))

	    for seed_x, seed_y in border_seeds:
	        # Only start a fill from pixels that are still empty (not edge, not filled).
	        if filled[seed_y, seed_x] == 0:
	            cv2.floodFill(filled, fill_mask, (seed_x, seed_y), 128)

	    # Pixels marked 128 were reached from the border; the rest is foreground.
	    page = np.where(filled == 128, 0, 255).astype(np.uint8)
	    page = cv2.morphologyEx(page, cv2.MORPH_CLOSE, close_kernel, iterations=2)
	    results.append(("FloodFill", page))

	    return results


	def find_notebook_contour(work_img):
	# Search every mask from get_binary_strategies() for the best four-point
	# outline of the page.
	#
	# Returns a tuple (best_quad, all_quads, is_fallback):
	#   best_quad   - float32 array of four (x, y) points, or None if nothing
	#                 qualified.
	#   all_quads   - every candidate quad that was recorded along the way.
	#   is_fallback - True only when best_quad came from the last-resort
	#                 bounding box at the end of the function.
	#
	# NOTE(review): the nesting of this function was lost in this copy of the
	# file (all lines sit at one indent level). The comments below describe
	# the statements themselves; where the nesting level matters and cannot
	# be recovered it is flagged - confirm against the upstream file.
	strategies = get_binary_strategies(work_img)
	img_area = work_img.shape[0] * work_img.shape[1]
	best_quad = None
	best_area = 0
	all_quads = []
	is_fallback = False
	# Largest contour seen in any mask, kept for the last-resort fallback.
	max_cnt = None
	max_cnt_area = 0

	for name, binary in strategies:
	contours, _ = cv2.findContours(binary, cv2.RETR_EXTERNAL,
	cv2.CHAIN_APPROX_SIMPLE)
	# Only the five largest external contours of each mask are examined.
	contours = sorted(contours, key=cv2.contourArea, reverse=True)[:5]

	for cnt in contours:
	area = cv2.contourArea(cnt)
	if area > max_cnt_area:
	max_cnt_area = area
	max_cnt = cnt
	# Contours covering less than 15% of the image are not candidates.
	if area < 0.15 * img_area:
	continue

	peri = cv2.arcLength(cnt, True)

	# Attempt 1: simplify the raw contour with 20 tolerances from 1% to 10%
	# of its perimeter, looking for a four-vertex polygon.
	for eps in np.linspace(0.01, 0.1, 20):
	approx = cv2.approxPolyDP(cnt, eps * peri, True)
	if len(approx) == 4:
	q = approx.reshape(4, 2).astype(np.float32)
	if is_valid_quad(q, work_img.shape):
	all_quads.append(q)
	if area > best_area:
	best_area = area
	best_quad = q
	# NOTE(review): cannot tell from this copy whether this break runs for
	# every four-vertex result or only for a valid one - confirm.
	break
	# Fewer than four vertices: stop trying larger tolerances.
	elif len(approx) < 4:
	break

	# Attempt 2: the same search on the convex hull of the contour.
	hull = cv2.convexHull(cnt)
	peri_h = cv2.arcLength(hull, True)
	for eps in np.linspace(0.01, 0.1, 20):
	approx = cv2.approxPolyDP(hull, eps * peri_h, True)
	if len(approx) == 4:
	q = approx.reshape(4, 2).astype(np.float32)
	if is_valid_quad(q, work_img.shape):
	all_quads.append(q)
	if area > best_area:
	best_area = area
	best_quad = q
	# NOTE(review): same nesting ambiguity as the break in attempt 1.
	break
	elif len(approx) < 4:
	break

	# Attempt 3: for contours above 20% of the image, the minimum-area
	# rotated rectangle is also a candidate; it is scored at 90% of the
	# contour area, so it must beat the current best by that margin.
	if area > 0.20 * img_area:
	box = cv2.boxPoints(cv2.minAreaRect(cnt)).astype(np.float32)
	if is_valid_quad(box, work_img.shape):
	all_quads.append(box)
	if area * 0.90 > best_area:
	best_area = area * 0.90
	best_quad = box

	# Last resort: no quad was accepted, but the largest contour covers more
	# than 10% of the image - use its rotated bounding box without running
	# is_valid_quad on it.
	# NOTE(review): presumably this runs once after all strategies have been
	# processed; the nesting level is not recoverable here - confirm.
	if best_quad is None and max_cnt is not None \
	and max_cnt_area > 0.10 * img_area:
	box = cv2.boxPoints(cv2.minAreaRect(max_cnt)).astype(np.float32)
	best_quad = box
	all_quads.append(box)
	is_fallback = True

	return best_quad, all_quads, is_fallback


	def draw_debug_image(work_img, corners, all_quads, is_fallback):
	    """Render a diagnostic overlay of the detection result on a copy of the image.

	    Every candidate quad is outlined thinly, the chosen quad (if any) is
	    outlined thickly with labelled corner dots, and a status banner is drawn
	    across the top 40 pixels.

	    Args:
	        work_img: Image the detection ran on; it is copied, not modified.
	        corners: Chosen quad as an array of four points, or None.
	        all_quads: Iterable of candidate quads to outline.
	        is_fallback: Whether ``corners`` came from the fallback path; selects
	            the outline colour and banner text.

	    Returns:
	        The annotated copy of ``work_img``.
	    """
	    canvas = work_img.copy()
	    width = canvas.shape[:2][1]

	    for candidate in all_quads:
	        cv2.polylines(canvas, [candidate.astype(np.int32)], True, (0, 255, 255), 1)

	    detected = corners is not None
	    if detected:
	        outline = (0, 165, 255) if is_fallback else (0, 255, 0)
	        cv2.polylines(canvas, [corners.astype(np.int32)], True, outline, 3)

	        labels = ["TL", "TR", "BR", "BL"]
	        dot_colors = [(255, 0, 0), (0, 0, 255), (255, 0, 255), (0, 255, 0)]
	        for point, label, dot_color in zip(order_points(corners), labels, dot_colors):
	            px, py = int(point[0]), int(point[1])
	            cv2.circle(canvas, (px, py), 8, dot_color, -1)
	            cv2.putText(canvas, label, (px + 10, py + 5),
	                        cv2.FONT_HERSHEY_SIMPLEX, 0.6, dot_color, 2)

	    # Solid black banner behind the status text.
	    cv2.rectangle(canvas, (0, 0), (width, 40), (0, 0, 0), -1)

	    if not detected:
	        status, status_color = "NOTHING DETECTED", (0, 0, 255)
	    elif is_fallback:
	        status, status_color = "FALLBACK", (0, 165, 255)
	    else:
	        status, status_color = "QUAD DETECTED (green outline)", (0, 255, 0)

	    cv2.putText(canvas, status, (10, 25), cv2.FONT_HERSHEY_SIMPLEX, 0.7,
	                status_color, 2)
	    return canvas


	def save_binary_debug(work_img, debug_path):
	    """Write a side-by-side contact sheet of all binary strategy masks.

	    Each mask from ``get_binary_strategies`` is resized to 300 px wide,
	    labelled with its strategy name, padded at the bottom to a common height
	    and concatenated horizontally.

	    Args:
	        work_img: Image handed to ``get_binary_strategies``.
	        debug_path: Path of the main debug image; the sheet is written to the
	            same string with ``"_debug."`` replaced by ``"_binary_debug."``.
	    """
	    target_width = 300
	    tiles = []
	    for label, mask in get_binary_strategies(work_img):
	        scale = target_width / mask.shape[1]
	        small = cv2.resize(mask, (target_width, int(mask.shape[0] * scale)))
	        tile = cv2.cvtColor(small, cv2.COLOR_GRAY2BGR)
	        cv2.putText(tile, label, (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 0.7,
	                    (0, 255, 0), 2)
	        tiles.append(tile)

	    tallest = max(tile.shape[0] for tile in tiles)

	    row = []
	    for tile in tiles:
	        missing = tallest - tile.shape[0]
	        if missing > 0:
	            # Pad with black rows so hstack sees equal heights.
	            filler = np.zeros((missing, tile.shape[1], 3), np.uint8)
	            tile = np.vstack([tile, filler])
	        row.append(tile)

	    sheet_path = debug_path.replace("_debug.", "_binary_debug.")
	    cv2.imwrite(sheet_path, np.hstack(row), [cv2.IMWRITE_JPEG_QUALITY, 85])


	def get_rotation_from_gemini(image_bytes: bytes) -> str:
	    """Ask Gemini which rotation makes the text in an image readable.

	    Args:
	        image_bytes: Encoded image file contents. PNG and WebP are recognised
	            from their file signatures; anything else is sent as JPEG.

	    Returns:
	        One of ``"90_counterclockwise"``, ``"90_clockwise"``, ``"180"`` or
	        ``"0"``. ``"90_counterclockwise"`` is returned when
	        ``GEMINI_API_KEY`` is unset or when the request or response parsing
	        fails. ``"0"`` is returned when the reply lacks a ``rotation`` key or
	        carries a value outside the four supported ones.
	    """
	    api_key = os.environ.get("GEMINI_API_KEY")
	    if not api_key:
	        print("[WARN] GEMINI_API_KEY not set. Defaulting to 90_counterclockwise", flush=True)
	        return "90_counterclockwise"

	    # Callers may pass PNG or WebP bytes, so label the payload from its file
	    # signature instead of always claiming JPEG. JPEG stays the default.
	    if image_bytes.startswith(b"\x89PNG\r\n\x1a\n"):
	        mime_type = "image/png"
	    elif image_bytes[:4] == b"RIFF" and image_bytes[8:12] == b"WEBP":
	        mime_type = "image/webp"
	    else:
	        mime_type = "image/jpeg"

	    valid_rotations = ("90_counterclockwise", "90_clockwise", "180", "0")

	    client = genai.Client(api_key=api_key)
	    model = "gemini-3.1-flash-lite-preview"

	    contents = [
	        types.Content(
	            role="user",
	            parts=[
	                types.Part.from_bytes(mime_type=mime_type, data=image_bytes),
	            ],
	        ),
	        types.Content(
	            role="model",
	            parts=[
	                types.Part.from_text(text="""```json\n{"rotation": "0"}\n```"""),
	            ],
	        ),
	        types.Content(
	            role="user",
	            parts=[
	                types.Part.from_text(text="""Determine the rotation needed to make this image readable."""),
	            ],
	        ),
	    ]

	    system_prompt = (
	        "you are the AI which detects which orientation the image should be "
	        "rotated such that the text becomes readable.\n"
	        "output strict json:\n"
	        '{"rotation": "90_counterclockwise", "90_clockwise", "180", "0"}'
	    )
	    generate_content_config = types.GenerateContentConfig(
	        system_instruction=[
	            types.Part.from_text(text=system_prompt),
	        ],
	        temperature=0.0,
	    )

	    # Service boundary: any failure here degrades to the default rotation
	    # rather than aborting image processing.
	    try:
	        response = client.models.generate_content(
	            model=model,
	            contents=contents,
	            config=generate_content_config,
	        )
	        text = response.text
	        if not text:
	            # Fail with a clear message instead of a TypeError from the
	            # substring checks below.
	            raise ValueError("response contained no text")

	        # Strip a Markdown code fence if the model wrapped its JSON in one.
	        if "```json" in text:
	            text = text.split("```json")[1].split("```")[0].strip()
	        elif "```" in text:
	            text = text.split("```")[1].split("```")[0].strip()

	        data = json.loads(text)
	        # Normalise to str so a numeric reply such as 180 still matches.
	        rotation = str(data.get("rotation", "0")).strip()
	    except Exception as e:
	        print(f"[ERROR] Gemini rotation detection failed: {e}", flush=True)
	        return "90_counterclockwise"

	    if rotation not in valid_rotations:
	        print(f"[WARN] Unexpected rotation value {rotation!r}. Defaulting to 0", flush=True)
	        return "0"
	    return rotation

	def process_image(input_path: str):
	    """Rotate, detect and crop the page in one image file, writing results to disk.

	    The image is rotated as advised by ``get_rotation_from_gemini``, scaled so
	    its longer side is 800 px for detection, and, if a quad is found, the
	    full-resolution rotated image is perspective-cropped to it. Three files
	    are written next to this script: ``<stem>_debug.jpg``,
	    ``<stem>_binary_debug.jpg`` and ``<stem>_cropped.jpg``.

	    Args:
	        input_path: Path of the image to process. If it cannot be read an
	            error is printed and nothing is written.
	    """
	    script_dir = os.path.dirname(os.path.abspath(__file__))
	    image = cv2.imread(input_path)
	    if image is None:
	        print(f"[ERROR] Cannot read: {input_path}")
	        return

	    # The rotation service receives the original encoded file, not the
	    # decoded pixel array.
	    with open(input_path, "rb") as f:
	        image_bytes = f.read()

	    rotation_str = get_rotation_from_gemini(image_bytes)
	    print(f"[INFO] Gemini detected rotation: {rotation_str}", flush=True)

	    if rotation_str == "90_counterclockwise":
	        rotated = cv2.rotate(image, cv2.ROTATE_90_COUNTERCLOCKWISE)
	    elif rotation_str == "90_clockwise":
	        rotated = cv2.rotate(image, cv2.ROTATE_90_CLOCKWISE)
	    elif rotation_str == "180":
	        rotated = cv2.rotate(image, cv2.ROTATE_180)
	    else:
	        rotated = image

	    orig_h, orig_w = rotated.shape[:2]

	    max_dim = 800.0
	    ratio = max(orig_h, orig_w) / max_dim
	    # Clamp to 1 px: an extremely elongated image would otherwise truncate
	    # to a zero dimension, which cv2.resize rejects.
	    work_w = max(int(orig_w / ratio), 1)
	    work_h = max(int(orig_h / ratio), 1)
	    work_img = cv2.resize(rotated, (work_w, work_h))

	    corners, all_quads, is_fallback = find_notebook_contour(work_img)
	    stem = Path(input_path).stem
	    debug_path = os.path.join(script_dir, f"{stem}_debug.jpg")

	    if corners is not None:
	        corners_exp = expand_quad(corners, work_img.shape, margin_frac=0.025)

	        # Map the quad from detection-image coordinates back to full resolution.
	        scale_x = orig_w / work_w
	        scale_y = orig_h / work_h
	        corners_orig = corners_exp.copy()
	        corners_orig[:, 0] *= scale_x
	        corners_orig[:, 1] *= scale_y
	        corners_orig[:, 0] = np.clip(corners_orig[:, 0], 0, orig_w - 1)
	        corners_orig[:, 1] = np.clip(corners_orig[:, 1], 0, orig_h - 1)

	        cropped = four_point_transform(rotated, corners_orig)
	        print("[INFO] Success! Applied crop.")
	    else:
	        print("[WARN] Total failure. Returning full rotated image.")
	        cropped = rotated

	    debug_img = draw_debug_image(work_img, corners, all_quads, is_fallback)
	    save_binary_debug(work_img, debug_path)
	    if not cv2.imwrite(debug_path, debug_img, [cv2.IMWRITE_JPEG_QUALITY, 90]):
	        print(f"[WARN] Could not write debug image: {debug_path}")

	    out_path = os.path.join(script_dir, f"{stem}_cropped.jpg")
	    # imwrite reports failure through its return value, so only claim
	    # success when the file was actually written.
	    if cv2.imwrite(out_path, cropped, [cv2.IMWRITE_JPEG_QUALITY, 95]):
	        print(f"[INFO] Saved cropped: {out_path}")
	    else:
	        print(f"[ERROR] Could not write: {out_path}")


	if __name__ == "__main__":
	    # Command-line entry point: process the paths given as arguments, or,
	    # when none are given, every source image sitting next to this script.
	    cli_paths = sys.argv[1:]
	    if cli_paths:
	        for p in cli_paths:
	            print(f"\nProcessing: {p}")
	            process_image(p)
	    else:
	        script_dir = os.path.dirname(os.path.abspath(__file__))
	        exts = (".jpg", ".jpeg", ".png", ".bmp", ".webp")
	        # Files produced by earlier runs carry these markers and are skipped.
	        skip = ("_cropped", "_debug", "_binary_debug")

	        files = []
	        for entry in os.listdir(script_dir):
	            if not entry.lower().endswith(exts):
	                continue
	            if any(marker in entry for marker in skip):
	                continue
	            files.append(entry)

	        if not files:
	            print("Place images next to script or provide paths.")
	            sys.exit(1)

	        files.sort()
	        for fn in files:
	            print(f"\nProcessing: {fn}")
	            process_image(os.path.join(script_dir, fn))


	def auto_crop_process(image_bytes: bytes) -> bytes:
	    """Run the crop pipeline on in-memory image bytes.

	    Steps:
	      1. Decode the bytes with ``cv2.imdecode``.
	      2. Rotate as advised by ``get_rotation_from_gemini``.
	      3. Detect the page quad on a copy scaled to 800 px on its longer side.
	      4. Perspective-crop the full-resolution rotated image to that quad,
	         or keep the rotated image when no quad is found.
	      5. Encode the result as JPEG (quality 95).

	    Args:
	        image_bytes: Encoded image file contents.

	    Returns:
	        JPEG bytes of the processed image. The input bytes are returned
	        unchanged when they cannot be decoded or the result cannot be
	        encoded.
	    """
	    nparr = np.frombuffer(image_bytes, np.uint8)
	    image = cv2.imdecode(nparr, cv2.IMREAD_COLOR)
	    if image is None:
	        return image_bytes

	    # 1. Rotate
	    rotation_str = get_rotation_from_gemini(image_bytes)
	    print(f"[PROCESS] Gemini detected rotation: {rotation_str}", flush=True)

	    if rotation_str == "90_counterclockwise":
	        rotated = cv2.rotate(image, cv2.ROTATE_90_COUNTERCLOCKWISE)
	    elif rotation_str == "90_clockwise":
	        rotated = cv2.rotate(image, cv2.ROTATE_90_CLOCKWISE)
	    elif rotation_str == "180":
	        rotated = cv2.rotate(image, cv2.ROTATE_180)
	    else:
	        rotated = image
	    orig_h, orig_w = rotated.shape[:2]

	    # 2. Resize for detection
	    max_dim = 800.0
	    ratio = max(orig_h, orig_w) / max_dim
	    # Clamp to 1 px: an extremely elongated image would otherwise truncate
	    # to a zero dimension, which cv2.resize rejects.
	    work_w = max(int(orig_w / ratio), 1)
	    work_h = max(int(orig_h / ratio), 1)
	    work_img = cv2.resize(rotated, (work_w, work_h))

	    # 3. Find contour
	    corners, all_quads, is_fallback = find_notebook_contour(work_img)

	    # 4. Transform
	    if corners is not None:
	        corners_exp = expand_quad(corners, work_img.shape, margin_frac=0.025)

	        # Map the quad from detection-image coordinates back to full resolution.
	        scale_x = orig_w / work_w
	        scale_y = orig_h / work_h
	        corners_orig = corners_exp.copy()
	        corners_orig[:, 0] *= scale_x
	        corners_orig[:, 1] *= scale_y
	        corners_orig[:, 0] = np.clip(corners_orig[:, 0], 0, orig_w - 1)
	        corners_orig[:, 1] = np.clip(corners_orig[:, 1], 0, orig_h - 1)

	        cropped = four_point_transform(rotated, corners_orig)
	    else:
	        cropped = rotated

	    # 5. Encode back to bytes
	    ok, result_bytes = cv2.imencode('.jpg', cropped, [cv2.IMWRITE_JPEG_QUALITY, 95])
	    if not ok:
	        # Same fallback as an undecodable input: hand back what we were given.
	        print("[PROCESS] JPEG encoding failed. Returning original bytes.", flush=True)
	        return image_bytes
	    return result_bytes.tobytes()