Kalhar.Pandya committed on
Commit · 05bac69
1 Parent(s): 81ce8e4
final
.env CHANGED
@@ -1,9 +1,9 @@
 
-OPENAI_API_KEY=sk-proj-…
 OPENAI_MODEL=gpt-4o-mini
 ROW_COUNT=7
 COL_COUNT=7
 ZOOM_LEVELS=1
 OVERLAP_FRAC=0.5
 PAD_FRAC=0
+MAX_CANDIDATES=3
 
app.py CHANGED
@@ -1,129 +1,97 @@
-import asyncio, base64, json,
 from pathlib import Path
 
 import cv2
 import gradio as gr
 import numpy as np
 from dotenv import load_dotenv
 from openai import OpenAI
 
 # ─────────── ENV + DEFAULTS ───────────
 load_dotenv()
-def _env(k, d=""):
-
 
-# API key and model will be provided through the UI
 DEFAULT_MODEL = _env("OPENAI_MODEL", "gpt-4o")
 
-# Available models for dropdown selection - all support vision capabilities
 AVAILABLE_MODELS = [
-    "gpt-4o",                # Current flagship model
-    "gpt-4o-mini",           # More economical version of gpt-4o
     "o1",                    # Advanced reasoning model with vision support
-    "o1-mini",               # Smaller, faster version
     "o3-mini",               # Newest reasoning model (Jan 2025)
     "gpt-4-vision-preview",  # Original vision model (being deprecated)
     "gpt-4-turbo"            # Older model with vision support
 ]
 
 DEFAULTS = dict(
-    row
-    col
-    zoom
-    overlap
-    pad
 )
 
 DEFAULT_PROMPT = (
-    "You are a
-    "
-    "Your task is to examine the entire image (or each grid cell) and determine whether there is any sign of a human presence. "
-    "Partial visibility is acceptable—look for any visible human features such as limbs, faces, clothing, or distinct shadows and silhouettes that contrast with natural surroundings. "
-    "Consider unusual color patterns, shapes, or textures that might indicate a person, even if partially obscured by vegetation or terrain. "
-    "Take your time to analyze all clues carefully, and if there is any doubt, mention your top candidate grid cell(s). "
     "Respond strictly with valid JSON in the following format:\n"
-
-    "
-    "
-    "-
-    "- MAYBE: Ambiguous or partial human evidence is present.\n"
-    "- NO: No evidence of human presence is detected."
 )
 
-
 # ─────────── HELPERS ───────────
 def encode(img):
-    """Encode image to
-    # Set JPEG quality to higher value for better image quality
     encode_params = [int(cv2.IMWRITE_JPEG_QUALITY), 95]
     _, buf = cv2.imencode(".jpg", img, encode_params)
     return base64.b64encode(buf).decode()
 
-
-    """
-
-        {"type":"text","text":prompt},
-        {"type":"image_url","image_url":{"url":f"data:image/jpeg;base64,{encode(img)}"}}
-    ]}]
-    delay=1
-    for attempt in range(5):  # Limit retries to 5
-        try:
-            r = await asyncio.to_thread(
-                client.chat.completions.create,
-                model=model, messages=msg, max_tokens=60,
-                response_format={"type":"json_object"}
-            )
-            return json.loads(r.choices[0].message.content)
-        except Exception as e:
-            if "rate limit" in str(e).lower():
-                await asyncio.sleep(delay)
-                delay=min(delay*2,32)
-            else:
-                return {"detected":"NO","confidence":0,"reason":f"Error: {str(e)[:50]}..."}
-    # If we get here, we've exhausted all retries
-    return {"detected":"NO","confidence":0,"reason":"Too many API retries, please try again later"}
 
-def
-    """
-
     for ry in range(ny):
         for cx in range(nx):
-            sx0=min(x0+cx*sx, x0+w-tw)
-            sy0=min(y0+ry*sy, y0+h-th)
-
-            tiles.append((sx0+px,sy0+py,tw-2*px,th-2*py))
     return tiles
 
-def rank(det):
-
-# More distinct colors with better contrast
-STAGE_COLOURS = [(0, 165, 255),   # Orange
-                 (0, 255, 0),     # Green
-                 (255, 0, 0),     # Blue (in BGR)
-                 (255, 255, 0),   # Cyan (in BGR)
-                 (128, 0, 128)]   # Purple
 
 def draw_path(img, path, results=None):
-    """Draw search path on image with
     out = img.copy()
     for i, r in enumerate(path):
-        x,y,w,h = r
-
         color = STAGE_COLOURS[i % len(STAGE_COLOURS)]
-        cv2.rectangle(out, (x1,y1), (x2,y2), color, 2)
-
-        # Add stage label
         label = f"S{i+1}"
         if results and i < len(results):
             res = results[i]
@@ -131,265 +99,196 @@ def draw_path(img, path, results=None):
             det = res["detected"]
             conf = res.get("confidence", 0)
             label += f": {det} ({conf:.2f})"
-
-        # Text with background for better visibility
         font = cv2.FONT_HERSHEY_SIMPLEX
         font_scale = 0.5
         thickness = 1
         text_size = cv2.getTextSize(label, font, font_scale, thickness)[0]
-
-        cv2.rectangle(out, (x1, y1-text_size[1]-5), (x1+text_size[0]+5, y1), color, -1)
-        cv2.putText(out, label, (x1+2, y1-5), font, font_scale, (255,255,255), thickness)
-
     return out
 
-
     """
-
     """
     if all_results is None:
         all_results = []
 
     if depth == 0:
-        return region, [],
-
-    subs = split(region, rows, cols, ov, pad)
-    prog(0, desc=f"Stage {depth}: scanning {len(subs)} tiles...")
-
-    async def task(i, r):
-        crop_img = crop(img, r)
-        result = await ask_api(crop_img, api_key, model)
-        return i, result, r
-
-    results = [None] * len(subs)
-    regions = [None] * len(subs)
-
-    for c in asyncio.as_completed([task(i, r) for i, r in enumerate(subs)]):
-        i, res, r = await c
-        results[i] = res
-        regions[i] = r
-        prog((i+1)/len(subs), desc=f"Stage {depth}: {i+1}/{len(subs)} tiles processed")
-
-    best_idx, score = None, (3, -1)
-    for i, d in enumerate(results):
-        s = (rank(d["detected"]), -d["confidence"])
-        if s < score:
-            best_idx, score = i, s
 
-
 
-# ───────────
-def run_pipeline(pil_img, api_key, model, rows, cols, zoom,
-    """
-
     error_message = None
-
-    # Check if image was provided
     if pil_img is None:
         error_message = "Error: Please upload an image to analyze."
-    # Check for API key
     elif not api_key or api_key.strip() == "":
-        error_message = "Error: OpenAI API key is required
-    # Check if model is selected
     elif not model or model.strip() == "":
         error_message = "Error: Please select an OpenAI model."
-
     if error_message:
-        return
-
-    # Input validation
     try:
-
-        ov = max(0, min(float(ov), 0.9))
-        pad = max(0, min(float(pad), 0.3))
-    except (ValueError, TypeError):
-        return (None, None, None, "Error: Invalid parameter values. Using defaults instead.")
 
-
-        # Create visualization of all tiles in the first stage
-        if all_results and len(all_results) > 0:
-            first_stage = all_results[0]
-            stage_img = img.copy()
-
-            for i, r in enumerate(first_stage["results"]):
-                region = first_stage["region"] if i == first_stage["best_idx"] else None
-                x,y,w,h = split(region or (0,0,1,1), rows, cols, ov, pad)[i]
-                H,W = img.shape[:2]
-                x1,y1,x2,y2 = int(x*W), int(y*H), int((x+w)*W), int((y+h)*H)
-
-                # Color based on detection
-                if r["detected"] == "YES":
-                    color = (0, 255, 0)    # Green
-                elif r["detected"] == "MAYBE":
-                    color = (0, 165, 255)  # Orange
-                else:
-                    color = (255, 0, 0)    # Red
-
-                # Draw rectangle with confidence
-                cv2.rectangle(stage_img, (x1,y1), (x2,y2), color, 1)
-                conf = r.get("confidence", 0)
-
-                # Add text with background
-                label = f"{r['detected']} ({conf:.2f})"
-                font = cv2.FONT_HERSHEY_SIMPLEX
-                font_scale = 0.4
-                thickness = 1
-                text_size = cv2.getTextSize(label, font, font_scale, thickness)[0]
-
-                # Draw background for text
-                cv2.rectangle(stage_img, (x1, y1-text_size[1]-5), (x1+text_size[0]+5, y1), (0,0,0), -1)
-                cv2.putText(stage_img, label, (x1+2, y1-5), font, font_scale, (255,255,255), thickness)
-
-            # Mark best tile with thicker border
-            best_idx = first_stage["best_idx"]
-            if best_idx is not None and best_idx < len(split((0,0,1,1), rows, cols, ov, pad)):
-                r = split((0,0,1,1), rows, cols, ov, pad)[best_idx]
-                x,y,w,h = r
-                H,W = img.shape[:2]
-                x1,y1,x2,y2 = int(x*W), int(y*H), int((x+w)*W), int((y+h)*H)
-                cv2.rectangle(stage_img, (x1,y1), (x2,y2), (0,255,255), 3)  # Yellow thick border
-        else:
-            stage_img = img.copy()
-
-        # Create a summary of the results
-        summary = []
-        for i, res in enumerate(best_results):
-            summary.append(f"Stage {i+1}: {res['detected']} ({res['confidence']:.2f}) - {res['reason']}")
-
-        summary_text = "\n".join(summary)
-
-        # Return results
-        return crop_img, path_img, stage_img, summary_text
-    except Exception as e:
-        # Handle any other exceptions
-        return (None, None, None, f"Error: {str(e)}")
 
-# ─────────── UI ───────────
-with gr.Blocks(title="Eagle‑Eyes Search", css="footer {visibility: hidden}") as demo:
     gr.Markdown("""
-    # 🦅 Eagle
 
-    Upload
-
-    How it works:
-    1. The image is divided into a grid based on your settings
-    2. Each grid cell is analyzed for human presence
-    3. The most promising cell is selected for the next zoom level
-    4. This process repeats for the specified number of zoom levels
     """)
 
     with gr.Row():
         with gr.Column(scale=1):
             img_in = gr.Image(type="pil", label="Input Image")
-
-            # API Key input (password field)
             api_key = gr.Textbox(
                 label="OpenAI API Key",
                 placeholder="Enter your OpenAI API key here...",
                 type="password",
                 info="Your API key will be used only for this session and not stored"
             )
-
-            # Model selection dropdown
             model = gr.Dropdown(
                 choices=AVAILABLE_MODELS,
                 value=DEFAULT_MODEL,
                 label="Model Selection",
                 info="Select the OpenAI model to use for analysis"
             )
-
             with gr.Group():
                 with gr.Row():
-                    row
-                    col
-                    zoom = gr.Number(value=DEFAULTS["zoom"], label="Zoom Levels", precision=0, minimum=1, maximum=
-
                 with gr.Row():
-
-                    pad = gr.Slider(0, 0.3, step=0.01, value=DEFAULTS["pad"], label="Tile Padding")
-
             btn = gr.Button("🔍 Run Search", variant="primary")
-
             summary_out = gr.Textbox(label="Results Summary", lines=5, interactive=False)
-
-            with gr.Tab("Search Path"):
-                path_out = gr.Image(label="Search Path (Colored by Zoom Level)")
-
-            with gr.Tab("First Stage Analysis"):
-                stage_out = gr.Image(label="First Stage Grid Analysis")
-
     gr.Markdown("""
     ### Tips for Best Results
 
-    - **OpenAI API Key**: Required
-    - **Model Selection**: Choose
-
-      - `o1-mini`: Smaller, faster version of o1
-      - `o3-mini`: Newest reasoning model (Jan 2025), optimized for STEM tasks
-      - `gpt-4-vision-preview`: Original vision model (being deprecated)
-      - `gpt-4-turbo`: Older model with vision capabilities
-    - **Grid Size**: More rows/columns give better precision but require more API calls
-    - **Zoom Levels**: More levels allow deeper searching in complex images
-    - **Overlap**: Higher overlap prevents missing objects at tile boundaries
-    - **Padding**: Reduces edge artifacts in grid cells
-
-    This tool uses OpenAI's vision API to analyze image regions and detect human presence.
     """)
-
-    btn.click(
-
 
 if __name__ == "__main__":
-    demo.launch()
+import asyncio, base64, json, os, tempfile
 from pathlib import Path
 
 import cv2
 import gradio as gr
 import numpy as np
+from PIL import Image
 from dotenv import load_dotenv
 from openai import OpenAI
 
 # ─────────── ENV + DEFAULTS ───────────
 load_dotenv()
+def _env(k, d=""):
+    return os.getenv(k, d).split("#", 1)[0].strip()
 
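For illustration, a minimal sketch (not part of the commit) of the comment-stripping behaviour `_env` adds on top of `os.getenv`; the variable value is invented:

import os

# Hypothetical .env value carrying an inline comment:
os.environ["ROW_COUNT"] = "7   # grid rows"
assert os.getenv("ROW_COUNT", "7").split("#", 1)[0].strip() == "7"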
 DEFAULT_MODEL = _env("OPENAI_MODEL", "gpt-4o")
 
 AVAILABLE_MODELS = [
+    "gpt-4o",                # Current flagship model with vision support
+    "gpt-4o-mini",           # More economical version of gpt-4o with vision support
     "o1",                    # Advanced reasoning model with vision support
+    "o1-mini",               # Smaller, faster version (if needed)
     "o3-mini",               # Newest reasoning model (Jan 2025)
     "gpt-4-vision-preview",  # Original vision model (being deprecated)
     "gpt-4-turbo"            # Older model with vision support
 ]
 
 DEFAULTS = dict(
+    row = int(_env("ROW_COUNT", "7")),
+    col = int(_env("COL_COUNT", "7")),
+    zoom = int(_env("ZOOM_LEVELS", "2")),              # Recursion depth (zoom levels)
+    overlap = 0.0,                                     # Fixed at 0 as requested
+    pad = 0.0,                                         # Fixed at 0 as requested
+    max_candidates = int(_env("MAX_CANDIDATES", "3"))  # Maximum number of candidates per search
 )
 
+# ─────────── PROMPT FOR GRID CELL ANALYSIS ───────────
 DEFAULT_PROMPT = (
+    "You are a vision inspector. Look at the image and determine if a human is present. "
+    "Partial visibility is acceptable—consider clues like limbs, clothing, silhouettes, shadows, or partial faces. "
     "Respond strictly with valid JSON in the following format:\n"
+    '{"detected":"YES/NO/MAYBE", "confidence":<float between 0 and 1>, "reason":"<15 words max>"}\n'
+    "- YES: Clearly visible human feature(s) are observed.\n"
+    "- MAYBE: Ambiguous or partial evidence is present.\n"
+    "- NO: No evidence of a human is detected."
 )
 
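A reply that honours this contract parses directly with the standard library; a small sketch with an invented reply string:

import json

reply = '{"detected": "MAYBE", "confidence": 0.42, "reason": "possible silhouette near treeline"}'
data = json.loads(reply)
assert data["detected"] in ("YES", "NO", "MAYBE")
assert 0.0 <= data["confidence"] <= 1.0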
 # ─────────── HELPERS ───────────
 def encode(img):
+    """Encode image to a Base64 string."""
     encode_params = [int(cv2.IMWRITE_JPEG_QUALITY), 95]
     _, buf = cv2.imencode(".jpg", img, encode_params)
     return base64.b64encode(buf).decode()
 
+def crop(img, r):
+    """Crop image to region r=(x,y,w,h) in relative coordinates."""
+    H, W = img.shape[:2]
+    x, y, w, h = r
+    return img[int(y * H):int((y + h) * H), int(x * W):int((x + w) * W)]
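A quick sketch of the relative-coordinate convention `crop` assumes, run on a dummy array:

import numpy as np

dummy = np.zeros((100, 200, 3), dtype=np.uint8)   # H=100, W=200
center = crop(dummy, (0.25, 0.25, 0.5, 0.5))      # middle quarter of the frame
assert center.shape[:2] == (50, 100)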
 
+def split(r, rows, cols, ov=0.0, pad=0.0):
+    """
+    Split region r=(x,y,w,h) into a grid of subregions with specified rows and columns.
+    Overlap and padding are fixed at 0 as configured.
+    """
+    x0, y0, w, h = r
+    tw, th = w / cols, h / rows
+    sx, sy = tw, th  # no overlap since ov=0.0
+    tiles = []
+    # Calculate number of grid cells
+    nx = max(1, int((w - tw) // sx) + 1)
+    ny = max(1, int((h - th) // sy) + 1)
     for ry in range(ny):
         for cx in range(nx):
+            sx0 = min(x0 + cx * sx, x0 + w - tw)
+            sy0 = min(y0 + ry * sy, y0 + h - th)
+            tiles.append((sx0, sy0, tw, th))
     return tiles
 
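With ov and pad fixed at 0 the arithmetic reduces to an even grid; a worked sketch for a 2×2 split of the full frame:

tiles = split((0, 0, 1, 1), rows=2, cols=2)
# Four equal quarters in relative coordinates:
# [(0.0, 0.0, 0.5, 0.5), (0.5, 0.0, 0.5, 0.5),
#  (0.0, 0.5, 0.5, 0.5), (0.5, 0.5, 0.5, 0.5)]
assert len(tiles) == 4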
+
def rank(det):
|
| 81 |
+
"""Rank the detection result."""
|
| 82 |
+
return {"YES": 0, "MAYBE": 1}.get(det, 2)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 83 |
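`rank` exists so detections sort YES first, then MAYBE, then NO, with confidence as the tie-breaker; a small sketch:

results = [("NO", 0.9), ("MAYBE", 0.6), ("YES", 0.3), ("YES", 0.8)]
results.sort(key=lambda t: (rank(t[0]), -t[1]))
# -> [("YES", 0.8), ("YES", 0.3), ("MAYBE", 0.6), ("NO", 0.9)]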
|
 def draw_path(img, path, results=None):
+    """Draw the search path on the image with rectangles for each stage."""
     out = img.copy()
+    STAGE_COLOURS = [(0, 165, 255), (0, 255, 0), (255, 0, 0), (255, 255, 0), (128, 0, 128)]
     for i, r in enumerate(path):
+        x, y, w, h = r
+        H, W = img.shape[:2]
+        x1, y1 = int(x * W), int(y * H)
+        x2, y2 = int((x + w) * W), int((y + h) * H)
         color = STAGE_COLOURS[i % len(STAGE_COLOURS)]
+        cv2.rectangle(out, (x1, y1), (x2, y2), color, 2)
         label = f"S{i+1}"
         if results and i < len(results):
             res = results[i]
             det = res["detected"]
             conf = res.get("confidence", 0)
             label += f": {det} ({conf:.2f})"
         font = cv2.FONT_HERSHEY_SIMPLEX
         font_scale = 0.5
         thickness = 1
         text_size = cv2.getTextSize(label, font, font_scale, thickness)[0]
+        cv2.rectangle(out, (x1, y1 - text_size[1] - 5), (x1 + text_size[0] + 5, y1), color, -1)
+        cv2.putText(out, label, (x1 + 2, y1 - 5), font, font_scale, (255, 255, 255), thickness)
     return out
 
+# ─────────── API CALL FOR A SINGLE GRID CELL ───────────
+async def ask_api(img, api_key, model="gpt-4o", custom_prompt=None):
+    """Send one grid cell image to the OpenAI API and return the result."""
+    client = OpenAI(api_key=api_key)
+    prompt = custom_prompt or DEFAULT_PROMPT
+    msg = [{
+        "role": "user",
+        "content": [
+            {"type": "text", "text": prompt},
+            {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{encode(img)}"}}
+        ]
+    }]
+    delay = 1
+    for attempt in range(5):
+        try:
+            response = await asyncio.to_thread(
+                client.chat.completions.create,
+                model=model,
+                messages=msg,
+                max_tokens=60,
+                response_format={"type": "json_object"}
+            )
+            return json.loads(response.choices[0].message.content)
+        except Exception as e:
+            if "rate limit" in str(e).lower():
+                await asyncio.sleep(delay)
+                delay = min(delay * 2, 32)
+            else:
+                return {"detected": "NO", "confidence": 0, "reason": f"Error: {str(e)[:50]}..."}
+    return {"detected": "NO", "confidence": 0, "reason": "Too many API retries, please try again later"}
+
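A minimal standalone driver for ask_api, assuming a valid key in the OPENAI_API_KEY environment variable; the blank tile is purely illustrative and the call costs one API request:

import asyncio, os
import numpy as np

test_tile = np.zeros((64, 64, 3), dtype=np.uint8)
verdict = asyncio.run(ask_api(test_tile, os.environ["OPENAI_API_KEY"], model="gpt-4o-mini"))
print(verdict)   # e.g. {"detected": "NO", "confidence": 0.9, "reason": "..."}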
+# ─────────── RECURSIVE SEARCH FUNCTION (MULTI-CANDIDATE) ───────────
+async def recurse_multi(img, region, depth, rows, cols, prog, api_key, model, max_candidates, all_results=None):
     """
+    Recursively analyze grid cells, allowing up to max_candidates per stage.
+    Returns a list of branch dictionaries with keys:
+      - "final_region": final region in the branch,
+      - "path": list of regions (from higher to lower levels),
+      - "stage_results": list of API results per stage.
     """
     if all_results is None:
         all_results = []
 
     if depth == 0:
+        return [{"final_region": region, "path": [], "stage_results": []}]
 
+    subs = split(region, rows, cols, ov=0.0, pad=0.0)
+    prog(0, desc=f"Stage {depth}: scanning {len(subs)} grid cells...")
+    tasks = []
+    for sub in subs:
+        crop_img = crop(img, sub)
+        tasks.append(ask_api(crop_img, api_key, model))
+    results = await asyncio.gather(*tasks)
 
+    # Pair each subregion with its result
+    sub_results = list(zip(subs, results))
+    # Sort by (rank, -confidence)
+    sub_results.sort(key=lambda tup: (rank(tup[1]["detected"]), -tup[1].get("confidence", 0)))
 
+    # Select candidates with positive detection ("YES" or "MAYBE"); if none, take the best candidate
+    candidates = [tup for tup in sub_results if tup[1]["detected"] in ("YES", "MAYBE")]
+    if not candidates:
+        candidates = [sub_results[0]]
+    candidates = candidates[:max_candidates]
 
+    branches = []
+    for candidate_region, candidate_result in candidates:
+        # For the current candidate, record its stage result
+        current_stage = {"region": candidate_region, "result": candidate_result}
+        # Recursively search within the candidate region
+        child_branches = await recurse_multi(img, candidate_region, depth - 1, rows, cols, prog, api_key, model, max_candidates)
+        for branch in child_branches:
+            branch["path"].insert(0, candidate_region)
+            branch["stage_results"].insert(0, candidate_result)
+            branches.append(branch)
+    return branches
 
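Branch growth compounds per stage, which is what the new MAX_CANDIDATES knob bounds: with max_candidates = 3 and zoom = 2, up to 3² = 9 branches can survive, and every region that gets expanded costs rows × cols API calls, so a 7 × 7 grid can issue up to 49 + 3 × 49 = 196 requests over the two stages.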
+# ─────────── PIPELINE FUNCTION ───────────
+def run_pipeline(pil_img, api_key, model, rows, cols, zoom, max_candidates, progress=gr.Progress()):
+    """
+    Process a single uploaded image:
+      1. Divide the image into grid cells.
+      2. Recursively zoom in by exploring up to max_candidates per stage.
+      3. Draw the search path on the original image.
+      4. Return the final cropped region (from the best branch), its search path, and a summary.
+    """
     error_message = None
     if pil_img is None:
         error_message = "Error: Please upload an image to analyze."
     elif not api_key or api_key.strip() == "":
+        error_message = "Error: OpenAI API key is required."
     elif not model or model.strip() == "":
         error_message = "Error: Please select an OpenAI model."
     if error_message:
+        return None, None, error_message
+
     try:
+        img_np = np.array(pil_img)
+    except Exception as e:
+        return None, None, f"Error converting image: {str(e)}"
 
+    full_region = (0, 0, 1, 1)
+    progress(0, desc=f"Starting recursive grid search using {model}...")
+    branches = asyncio.run(
+        recurse_multi(img_np, full_region, zoom, rows, cols, progress, api_key, model, max_candidates)
+    )
+
+    if not branches:
+        return None, None, "No branch found."
+
+    # Select the branch with the highest confidence in its most zoomed-in stage
+    best_branch = max(branches, key=lambda b: b["stage_results"][-1].get("confidence", 0))
+    final_reg = best_branch["final_region"] if "final_region" in best_branch else best_branch["path"][-1]
+    final_crop = crop(img_np, final_reg)
+    final_crop_pil = Image.fromarray(final_crop)
+
+    # Draw the search path; the branch's path is already ordered top level first
+    path_order = best_branch["path"]
+    stage_results_order = best_branch["stage_results"]
+    path_img = draw_path(img_np, path_order, stage_results_order)
+    path_img_pil = Image.fromarray(path_img)
+
+    # Build summary text for the branch
+    summary_lines = []
+    for i, res in enumerate(stage_results_order):
+        summary_lines.append(f"Stage {i+1}: {res['detected']} ({res['confidence']:.2f}) - {res['reason']}")
+    summary_text = "\n".join(summary_lines)
+
+    return final_crop_pil, path_img_pil, summary_text
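A headless usage sketch, assuming a key in the environment and a local sample file (the file name is hypothetical); a no-op callback stands in for Gradio's progress object:

import os
from PIL import Image

img = Image.open("aerial_test.jpg")   # hypothetical sample image
final_crop, path_img, summary = run_pipeline(
    img, os.environ["OPENAI_API_KEY"], "gpt-4o-mini",
    rows=7, cols=7, zoom=2, max_candidates=3,
    progress=lambda *a, **k: None,
)
print(summary)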
 
+# ─────────── GRADIO UI ───────────
+with gr.Blocks(title="Eagle‑Eyes Recursive Grid Search", css="footer {visibility: hidden}") as demo:
     gr.Markdown("""
+    # 🦅 Eagle‑Eyes Recursive Grid Search
 
+    Upload a single image. The tool will divide the image into grid cells and recursively zoom in on the most promising region.
+    At each stage, up to a configurable number of positive candidates are explored. The search path is drawn on the original image.
     """)
 
     with gr.Row():
         with gr.Column(scale=1):
             img_in = gr.Image(type="pil", label="Input Image")
             api_key = gr.Textbox(
                 label="OpenAI API Key",
                 placeholder="Enter your OpenAI API key here...",
                 type="password",
                 info="Your API key will be used only for this session and not stored"
             )
             model = gr.Dropdown(
                 choices=AVAILABLE_MODELS,
                 value=DEFAULT_MODEL,
                 label="Model Selection",
                 info="Select the OpenAI model to use for analysis"
             )
             with gr.Group():
                 with gr.Row():
+                    row = gr.Number(value=DEFAULTS["row"], label="Grid Rows", precision=0, minimum=1, maximum=10)
+                    col = gr.Number(value=DEFAULTS["col"], label="Grid Columns", precision=0, minimum=1, maximum=10)
+                    zoom = gr.Number(value=DEFAULTS["zoom"], label="Zoom Levels", precision=0, minimum=1, maximum=5)
                 with gr.Row():
+                    max_candidates = gr.Number(value=DEFAULTS["max_candidates"], label="Max Candidates per Stage", precision=0, minimum=1, maximum=10)
             btn = gr.Button("🔍 Run Search", variant="primary")
             summary_out = gr.Textbox(label="Results Summary", lines=5, interactive=False)
+        with gr.Column(scale=1):
+            crop_out = gr.Image(label="Final Crop (Zoomed Region)")
+            path_out = gr.Image(label="Search Path Visualization")
+
     gr.Markdown("""
     ### Tips for Best Results
 
+    - **OpenAI API Key**: Required for this tool. Your key remains private.
+    - **Model Selection**: Choose a model with vision support (e.g., `gpt-4o`, `gpt-4o-mini`, `o1`, etc.).
+    - **Grid Settings**: Adjust rows and columns to fine-tune segmentation.
+    - **Zoom Levels**: More zoom levels allow a deeper recursive search.
+    - **Max Candidates per Stage**: Controls how many positive grid cells to explore at each stage.
     """)
+
+    btn.click(
+        run_pipeline,
+        inputs=[img_in, api_key, model, row, col, zoom, max_candidates],
+        outputs=[crop_out, path_out, summary_out]
+    )
 
 if __name__ == "__main__":
+    demo.launch()