Kalhar.Pandya committed
Commit 05bac69 · 1 Parent(s): 81ce8e4
Files changed (2)
  1. .env +1 -1
  2. app.py +187 -288
.env CHANGED
@@ -1,9 +1,9 @@
 
- OPENAI_API_KEY=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
  OPENAI_MODEL=gpt-4o-mini
  ROW_COUNT=7
  COL_COUNT=7
  ZOOM_LEVELS=1
  OVERLAP_FRAC=0.5
  PAD_FRAC=0
+ MAX_CANDIDATES=3
 
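A note on how app.py reads these values: its `_env` helper strips inline `# comment` text and whitespace before parsing. A minimal sketch of that behavior (the `str(d)` coercion is an added safeguard, not in the committed helper, since app.py passes integer defaults that would break `.split()` whenever a variable is missing from the environment):

```python
import os

def _env(k, d=""):
    # Mirror of app.py's helper: read an env var, drop any inline "# comment", trim whitespace.
    # str(d) is an extra safeguard (app.py passes ints like 7, which would fail on .split
    # if the variable were absent from the environment).
    return os.getenv(k, str(d)).split("#", 1)[0].strip()

# With the .env above loaded, these match app.py's DEFAULTS:
rows = int(_env("ROW_COUNT", 7))                  # -> 7
max_candidates = int(_env("MAX_CANDIDATES", 3))   # -> 3
```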
app.py CHANGED
@@ -1,129 +1,97 @@
- import asyncio, base64, json, math, os, tempfile
  from pathlib import Path

  import cv2
  import gradio as gr
  import numpy as np
  from dotenv import load_dotenv
  from openai import OpenAI

  # ─────────── ENV + DEFAULTS ───────────
  load_dotenv()
- def _env(k, d=""): return os.getenv(k, d).split("#", 1)[0].strip()

- # API key and model will be provided through the UI
  DEFAULT_MODEL = _env("OPENAI_MODEL", "gpt-4o")

- # Available models for dropdown selection - all support vision capabilities
  AVAILABLE_MODELS = [
-     "gpt-4o",                # Current flagship model (most recommended)
-     "gpt-4o-mini",           # More economical version of gpt-4o
      "o1",                    # Advanced reasoning model with vision support
-     "o1-mini",               # Smaller, faster version of o1
      "o3-mini",               # Newest reasoning model (Jan 2025)
      "gpt-4-vision-preview",  # Original vision model (being deprecated)
      "gpt-4-turbo"            # Older model with vision support
  ]

  DEFAULTS = dict(
-     row     = int(_env("ROW_COUNT", 7)),
-     col     = int(_env("COL_COUNT", 7)),
-     zoom    = int(_env("ZOOM_LEVELS", 1)),
-     overlap = float(_env("OVERLAP_FRAC", 0.5)),
-     pad     = float(_env("PAD_FRAC", 0.0)),
  )

  DEFAULT_PROMPT = (
-     "You are a highly detailed vision inspector specialized in human detection from aerial imagery. "
-     "You are provided with an image that may be divided into grid cells, each labeled with a unique number. "
-     "Your task is to examine the entire image (or each grid cell) and determine whether there is any sign of a human presence. "
-     "Partial visibility is acceptable—look for any visible human features such as limbs, faces, clothing, or distinct shadows and silhouettes that contrast with natural surroundings. "
-     "Consider unusual color patterns, shapes, or textures that might indicate a person, even if partially obscured by vegetation or terrain. "
-     "Take your time to analyze all clues carefully, and if there is any doubt, mention your top candidate grid cell(s). "
      "Respond strictly with valid JSON in the following format:\n"
-     "  {\"detected\":\"YES/NO/MAYBE\", \"confidence\":<float between 0 and 1>, \"reason\":\"<15 words max>\"}\n"
-     "For example, if a grid cell shows a clear human silhouette with contrasting clothing, your response might be:\n"
-     "  {\"detected\":\"YES\", \"confidence\":0.87, \"reason\":\"Clear human figure in grid cell 23 with distinct clothing and shadow.\"}\n"
-     "- YES: A human or clear human-like feature is observed.\n"
-     "- MAYBE: Ambiguous or partial human evidence is present.\n"
-     "- NO: No evidence of human presence is detected."
  )

-
-
  # ─────────── HELPERS ───────────
  def encode(img):
-     """Encode image to base64 string"""
-     # Set JPEG quality to higher value for better image quality
      encode_params = [int(cv2.IMWRITE_JPEG_QUALITY), 95]
      _, buf = cv2.imencode(".jpg", img, encode_params)
      return base64.b64encode(buf).decode()

- async def ask_api(img, api_key, model="gpt-4o", custom_prompt=None):
-     """Ask OpenAI API about an image"""
-     client = OpenAI(api_key=api_key)
-     prompt = custom_prompt or DEFAULT_PROMPT
-     msg = [{"role": "user", "content": [
-         {"type": "text", "text": prompt},
-         {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{encode(img)}"}}
-     ]}]
-     delay = 1
-     for attempt in range(5):  # Limit retries to 5
-         try:
-             r = await asyncio.to_thread(
-                 client.chat.completions.create,
-                 model=model, messages=msg, max_tokens=60,
-                 response_format={"type": "json_object"}
-             )
-             return json.loads(r.choices[0].message.content)
-         except Exception as e:
-             if "rate limit" in str(e).lower():
-                 await asyncio.sleep(delay)
-                 delay = min(delay * 2, 32)
-             else:
-                 return {"detected": "NO", "confidence": 0, "reason": f"Error: {str(e)[:50]}..."}
-     # If we get here, we've exhausted all retries
-     return {"detected": "NO", "confidence": 0, "reason": "Too many API retries, please try again later"}

- def crop(img, r):
-     """Crop image to region r=(x,y,w,h) in relative coordinates"""
-     H, W = img.shape[:2]; x, y, w, h = r
-     return img[int(y*H):int((y+h)*H), int(x*W):int((x+w)*W)]
-
- def split(r, rows, cols, ov, pad):
-     """Split region r into a grid of subregions with overlap and padding"""
-     x0, y0, w, h = r; tw, th = w/cols, h/rows
-     sx, sy = tw*(1-ov), th*(1-ov)
-     nx = max(1, int((w-tw)//sx)+1); ny = max(1, int((h-th)//sy)+1)
-     tiles = []
      for ry in range(ny):
          for cx in range(nx):
-             sx0 = min(x0+cx*sx, x0+w-tw)
-             sy0 = min(y0+ry*sy, y0+h-th)
-             px, py = tw*pad, th*pad
-             tiles.append((sx0+px, sy0+py, tw-2*px, th-2*py))
      return tiles

- def rank(det): return {"YES": 0, "MAYBE": 1}.get(det, 2)
-
- # ─────────── RECURSIVE SEARCH ───────────
- # More distinct colors with better contrast
- STAGE_COLOURS = [(0, 165, 255),  # Orange
-                  (0, 255, 0),    # Green
-                  (255, 0, 0),    # Blue (in RGB)
-                  (255, 255, 0),  # Cyan (in RGB)
-                  (128, 0, 128)]  # Purple

  def draw_path(img, path, results=None):
-     """Draw search path on image with optional detection results"""
      out = img.copy()
      for i, r in enumerate(path):
-         x, y, w, h = r; H, W = img.shape[:2]
-         x1, y1, x2, y2 = int(x*W), int(y*H), int((x+w)*W), int((y+h)*H)
          color = STAGE_COLOURS[i % len(STAGE_COLOURS)]
-         cv2.rectangle(out, (x1, y1), (x2, y2), color, 2)
-
-         # Add stage label
          label = f"S{i+1}"
          if results and i < len(results):
              res = results[i]
@@ -131,265 +99,196 @@ def draw_path(img, path, results=None):
              det = res["detected"]
              conf = res.get("confidence", 0)
              label += f": {det} ({conf:.2f})"
-
-         # Text with background for better visibility
          font = cv2.FONT_HERSHEY_SIMPLEX
          font_scale = 0.5
          thickness = 1
          text_size = cv2.getTextSize(label, font, font_scale, thickness)[0]
-
-         # Draw background rectangle for text
-         cv2.rectangle(out, (x1, y1-text_size[1]-5), (x1+text_size[0]+5, y1), color, -1)
-         cv2.putText(out, label, (x1+2, y1-5), font, font_scale, (255, 255, 255), thickness)
-
      return out

- async def recurse(img, region, depth, rows, cols, ov, pad, prog, api_key, model, all_results=None):
      """
-     Returns (final_region, path, results) where:
-     - path is a list of chosen regions, one per stage
-     - results is a list of API results for each stage
      """
      if all_results is None:
          all_results = []

      if depth == 0:
-         return region, [], all_results
-
-     subs = split(region, rows, cols, ov, pad)
-     prog(0, desc=f"Stage {depth}: scanning {len(subs)} tiles...")
-
-     async def task(i, r):
-         crop_img = crop(img, r)
-         result = await ask_api(crop_img, api_key, model)
-         return i, result, r
-
-     results = [None] * len(subs)
-     regions = [None] * len(subs)
-
-     for c in asyncio.as_completed([task(i, r) for i, r in enumerate(subs)]):
-         i, res, r = await c
-         results[i] = res
-         regions[i] = r
-         prog((i+1)/len(subs), desc=f"Stage {depth}: {i+1}/{len(subs)} tiles processed")
-
-     best_idx, score = None, (3, -1)
-     for i, d in enumerate(results):
-         s = (rank(d["detected"]), -d["confidence"])
-         if s < score:
-             best_idx, score = i, s

-     if best_idx is None:
-         best_idx = 0

-     best_region = regions[best_idx]
-     stage_results = {"region": best_region, "results": results, "best_idx": best_idx}
-     all_results.append(stage_results)

-     final_reg, sub_path, all_res = await recurse(
-         img, best_region, depth - 1, rows, cols, ov, pad, prog, api_key, model, all_results
-     )
-

-     return final_reg, [best_region] + sub_path, all_res

- # ─────────── GRADIO PIPELINE ───────────
- def run_pipeline(pil_img, api_key, model, rows, cols, zoom, ov, pad, progress=gr.Progress()):
-     """Main pipeline to process an image and find humans"""
-     # Input validation and error checking
      error_message = None
-
-     # Check if image was provided
      if pil_img is None:
          error_message = "Error: Please upload an image to analyze."
-     # Check for API key
      elif not api_key or api_key.strip() == "":
-         error_message = "Error: OpenAI API key is required to run the search."
-     # Check if model is selected
      elif not model or model.strip() == "":
          error_message = "Error: Please select an OpenAI model."
-
      if error_message:
-         return (None, None, None, error_message)
-
-     # Input validation
      try:
-         rows = max(1, min(int(rows), 10))
-         cols = max(1, min(int(cols), 10))
-         zoom = max(1, min(int(zoom), 3))
-         ov = max(0, min(float(ov), 0.9))
-         pad = max(0, min(float(pad), 0.3))
-     except (ValueError, TypeError):
-         return (None, None, None, "Error: Invalid parameter values. Using defaults instead.")

-     with tempfile.TemporaryDirectory() as td:
-         try:
-             img_path = str(Path(td) / "in.jpg")
-             pil_img.save(img_path)
-
-             # Convert to RGB after reading (OpenCV reads as BGR)
-             img = cv2.cvtColor(cv2.imread(img_path), cv2.COLOR_BGR2RGB)
-
-             progress(0, desc=f"Starting recursive search using {model}...")
-             final_reg, path, all_results = asyncio.run(
-                 recurse(img, (0, 0, 1, 1), zoom, rows, cols, ov, pad, progress, api_key, model)
-             )
-
-             # Get final crop (already in RGB)
-             crop_img = crop(img, final_reg)
-
-             # Collect results for each stage
-             stage_results = []
-             best_results = []
-             for stage in all_results:
-                 best_idx = stage["best_idx"]
-                 results = stage["results"]
-                 if best_idx is not None and best_idx < len(results):
-                     best_results.append(results[best_idx])
-
-             # Create the path visualization (already in RGB)
-             path_img = draw_path(img, path, best_results)
-
-             # Create visualization of all tiles in the first stage
-             if all_results and len(all_results) > 0:
-                 first_stage = all_results[0]
-                 stage_img = img.copy()
-
-                 for i, r in enumerate(first_stage["results"]):
-                     region = first_stage["region"] if i == first_stage["best_idx"] else None
-                     x, y, w, h = split(region or (0, 0, 1, 1), rows, cols, ov, pad)[i]
-                     H, W = img.shape[:2]
-                     x1, y1, x2, y2 = int(x*W), int(y*H), int((x+w)*W), int((y+h)*H)
-
-                     # Color based on detection
-                     if r["detected"] == "YES":
-                         color = (0, 255, 0)    # Green
-                     elif r["detected"] == "MAYBE":
-                         color = (0, 165, 255)  # Orange
-                     else:
-                         color = (255, 0, 0)    # Red (in RGB)
-
-                     # Draw rectangle with confidence
-                     cv2.rectangle(stage_img, (x1, y1), (x2, y2), color, 1)
-                     conf = r.get("confidence", 0)
-
-                     # Add text with background
-                     label = f"{r['detected']} ({conf:.2f})"
-                     font = cv2.FONT_HERSHEY_SIMPLEX
-                     font_scale = 0.4
-                     thickness = 1
-                     text_size = cv2.getTextSize(label, font, font_scale, thickness)[0]
-
-                     # Draw background for text
-                     cv2.rectangle(stage_img, (x1, y1-text_size[1]-5), (x1+text_size[0]+5, y1), (0, 0, 0), -1)
-                     cv2.putText(stage_img, label, (x1+2, y1-5), font, font_scale, (255, 255, 255), thickness)
-
-                 # Mark best tile with thicker border
-                 best_idx = first_stage["best_idx"]
-                 if best_idx is not None and best_idx < len(split((0, 0, 1, 1), rows, cols, ov, pad)):
-                     r = split((0, 0, 1, 1), rows, cols, ov, pad)[best_idx]
-                     x, y, w, h = r
-                     H, W = img.shape[:2]
-                     x1, y1, x2, y2 = int(x*W), int(y*H), int((x+w)*W), int((y+h)*H)
-                     cv2.rectangle(stage_img, (x1, y1), (x2, y2), (0, 255, 255), 3)  # Yellow thick border
-             else:
-                 stage_img = img.copy()
-
-             # Create a summary of the results
-             summary = []
-             for i, res in enumerate(best_results):
-                 summary.append(f"Stage {i+1}: {res['detected']} ({res['confidence']:.2f}) - {res['reason']}")
-
-             summary_text = "\n".join(summary)
-
-             # Return results
-             return crop_img, path_img, stage_img, summary_text
-         except Exception as e:
-             # Handle any other exceptions
-             return (None, None, None, f"Error: {str(e)}")

- # ─────────── UI ───────────
- with gr.Blocks(title="Eagle‑Eyes Search", css="footer {visibility: hidden}") as demo:
      gr.Markdown("""
-     # 🦅 Eagle-Eyes Search

-     Upload an image to find humans using recursive zoom technology. The system divides the image
-     into a grid and recursively zooms into the most promising regions.
-
-     How it works:
-     1. The image is divided into a grid based on your settings
-     2. Each grid cell is analyzed for human presence
-     3. The most promising cell is selected for the next zoom level
-     4. This process repeats for the specified number of zoom levels
      """)

      with gr.Row():
          with gr.Column(scale=1):
              img_in = gr.Image(type="pil", label="Input Image")
-
-             # API Key input (password field)
              api_key = gr.Textbox(
                  label="OpenAI API Key",
                  placeholder="Enter your OpenAI API key here...",
                  type="password",
                  info="Your API key will be used only for this session and not stored"
              )
-
-             # Model selection dropdown
              model = gr.Dropdown(
                  choices=AVAILABLE_MODELS,
                  value=DEFAULT_MODEL,
                  label="Model Selection",
                  info="Select the OpenAI model to use for analysis"
              )
-
              with gr.Group():
                  with gr.Row():
-                     row = gr.Number(value=DEFAULTS["row"], label="Grid Rows", precision=0, minimum=1, maximum=10)
-                     col = gr.Number(value=DEFAULTS["col"], label="Grid Columns", precision=0, minimum=1, maximum=10)
-                     zoom = gr.Number(value=DEFAULTS["zoom"], label="Zoom Levels", precision=0, minimum=1, maximum=3)
-
                  with gr.Row():
-                     ov = gr.Slider(0, 0.9, step=0.05, value=DEFAULTS["overlap"], label="Tile Overlap")
-                     pad = gr.Slider(0, 0.3, step=0.01, value=DEFAULTS["pad"], label="Tile Padding")
-
              btn = gr.Button("🔍 Run Search", variant="primary")
-
              summary_out = gr.Textbox(label="Results Summary", lines=5, interactive=False)
-
-         with gr.Column(scale=2):
-             with gr.Tab("Final Crop"):
-                 crop_out = gr.Image(label="Final Crop (Most Likely Human Location)")
-
-             with gr.Tab("Search Path"):
-                 path_out = gr.Image(label="Search Path (Colored by Zoom Level)")
-
-             with gr.Tab("First Stage Analysis"):
-                 stage_out = gr.Image(label="First Stage Grid Analysis")
-
      gr.Markdown("""
      ### Tips for Best Results

-     - **OpenAI API Key**: Required to use this tool. Your key remains private and is not stored
-     - **Model Selection**: Choose the appropriate OpenAI model:
-       - `gpt-4o`: Best overall performance for general vision tasks (recommended)
-       - `gpt-4o-mini`: More economical version of gpt-4o with good performance
-       - `o1`: Advanced reasoning model, excellent for complex analysis
-       - `o1-mini`: Smaller, faster version of o1
-       - `o3-mini`: Newest reasoning model (Jan 2025), optimized for STEM tasks
-       - `gpt-4-vision-preview`: Original vision model (being deprecated)
-       - `gpt-4-turbo`: Older model with vision capabilities
-     - **Grid Size**: More rows/columns give better precision but require more API calls
-     - **Zoom Levels**: More levels allow deeper searching in complex images
-     - **Overlap**: Higher overlap prevents missing objects at tile boundaries
-     - **Padding**: Reduces edge artifacts in grid cells
-
-     This tool uses OpenAI's vision API to analyze image regions and detect human presence.
      """)
-
-     btn.click(run_pipeline,
-               inputs=[img_in, api_key, model, row, col, zoom, ov, pad],
-               outputs=[crop_out, path_out, stage_out, summary_out])

  if __name__ == "__main__":
-     demo.launch()
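The rewritten app.py below fixes overlap and padding at 0, and the old `split()` stride arithmetic shows why that matters for cost: with the old default `OVERLAP_FRAC=0.5`, a nominal 7×7 grid produces 13×13 = 169 tiles, and each tile is one API call per stage. A minimal standalone sketch of that arithmetic, assuming the same formulas as the old `split()` (the `tile_counts` helper is illustrative, not part of the commit):

```python
# Standalone re-check of the old split() stride math over a unit region.
def tile_counts(rows, cols, ov, w=1.0, h=1.0):
    tw, th = w / cols, h / rows              # tile size in relative coordinates
    sx, sy = tw * (1 - ov), th * (1 - ov)    # stride shrinks as overlap grows
    nx = max(1, int((w - tw) // sx) + 1)
    ny = max(1, int((h - th) // sy) + 1)
    return nx, ny

print(tile_counts(7, 7, 0.5))  # (13, 13): 169 tiles, so 169 API calls per stage
print(tile_counts(7, 7, 0.0))  # (7, 7): 49 tiles, matching the new fixed ov=0.0
```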
+ import asyncio, base64, json, os, tempfile
  from pathlib import Path

  import cv2
  import gradio as gr
  import numpy as np
+ from PIL import Image
  from dotenv import load_dotenv
  from openai import OpenAI

  # ─────────── ENV + DEFAULTS ───────────
  load_dotenv()
+ def _env(k, d=""):
+     return os.getenv(k, d).split("#", 1)[0].strip()

  DEFAULT_MODEL = _env("OPENAI_MODEL", "gpt-4o")

  AVAILABLE_MODELS = [
+     "gpt-4o",                # Current flagship model with vision support
+     "gpt-4o-mini",           # More economical version of gpt-4o with vision support
      "o1",                    # Advanced reasoning model with vision support
+     "o1-mini",               # Smaller, faster version (if needed)
      "o3-mini",               # Newest reasoning model (Jan 2025)
      "gpt-4-vision-preview",  # Original vision model (being deprecated)
      "gpt-4-turbo"            # Older model with vision support
  ]

  DEFAULTS = dict(
+     row = int(_env("ROW_COUNT", 7)),
+     col = int(_env("COL_COUNT", 7)),
+     zoom = int(_env("ZOOM_LEVELS", 2)),               # Recursion depth (zoom levels)
+     overlap = 0.0,                                    # Fixed at 0 as requested
+     pad = 0.0,                                        # Fixed at 0 as requested
+     max_candidates = int(_env("MAX_CANDIDATES", 3))   # Maximum number of candidates per search
  )

+ # ─────────── PROMPT FOR GRID CELL ANALYSIS ───────────
  DEFAULT_PROMPT = (
+     "You are a vision inspector. Look at the image and determine if a human is present. "
+     "Partial visibility is acceptable—consider clues like limbs, clothing, silhouettes, shadows, or partial faces. "
      "Respond strictly with valid JSON in the following format:\n"
+     '{"detected":"YES/NO/MAYBE", "confidence":<float between 0 and 1>, "reason":"<15 words max>"}\n'
+     "- YES: Clearly visible human feature(s) are observed.\n"
+     "- MAYBE: Ambiguous or partial evidence is present.\n"
+     "- NO: No evidence of a human is detected."
  )

  # ─────────── HELPERS ───────────
  def encode(img):
+     """Encode image to a Base64 string."""
      encode_params = [int(cv2.IMWRITE_JPEG_QUALITY), 95]
      _, buf = cv2.imencode(".jpg", img, encode_params)
      return base64.b64encode(buf).decode()

+ def crop(img, r):
+     """Crop image to region r=(x,y,w,h) in relative coordinates."""
+     H, W = img.shape[:2]
+     x, y, w, h = r
+     return img[int(y * H):int((y + h) * H), int(x * W):int((x + w) * W)]

+ def split(r, rows, cols, ov=0.0, pad=0.0):
+     """
+     Split region r=(x,y,w,h) into a grid of subregions with the specified rows and columns.
+     Overlap and padding are fixed at 0 as configured.
+     """
+     x0, y0, w, h = r
+     tw, th = w / cols, h / rows
+     sx, sy = tw, th  # no overlap since ov=0.0
+     tiles = []
+     # Calculate number of grid cells
+     nx = max(1, int((w - tw) // sx) + 1)
+     ny = max(1, int((h - th) // sy) + 1)
      for ry in range(ny):
          for cx in range(nx):
+             sx0 = min(x0 + cx * sx, x0 + w - tw)
+             sy0 = min(y0 + ry * sy, y0 + h - th)
+             tiles.append((sx0, sy0, tw, th))
      return tiles

+ def rank(det):
+     """Rank the detection result."""
+     return {"YES": 0, "MAYBE": 1}.get(det, 2)

  def draw_path(img, path, results=None):
+     """Draw the search path on the image with rectangles for each stage."""
      out = img.copy()
+     STAGE_COLOURS = [(0, 165, 255), (0, 255, 0), (255, 0, 0), (255, 255, 0), (128, 0, 128)]
      for i, r in enumerate(path):
+         x, y, w, h = r
+         H, W = img.shape[:2]
+         x1, y1 = int(x * W), int(y * H)
+         x2, y2 = int((x + w) * W), int((y + h) * H)
          color = STAGE_COLOURS[i % len(STAGE_COLOURS)]
+         cv2.rectangle(out, (x1, y1), (x2, y2), color, 2)
          label = f"S{i+1}"
          if results and i < len(results):
              res = results[i]
              det = res["detected"]
              conf = res.get("confidence", 0)
              label += f": {det} ({conf:.2f})"
          font = cv2.FONT_HERSHEY_SIMPLEX
          font_scale = 0.5
          thickness = 1
          text_size = cv2.getTextSize(label, font, font_scale, thickness)[0]
+         cv2.rectangle(out, (x1, y1 - text_size[1] - 5), (x1 + text_size[0] + 5, y1), color, -1)
+         cv2.putText(out, label, (x1 + 2, y1 - 5), font, font_scale, (255, 255, 255), thickness)
      return out

+ # ─────────── API CALL FOR A SINGLE GRID CELL ───────────
+ async def ask_api(img, api_key, model="gpt-4o", custom_prompt=None):
+     """Send one grid cell image to the OpenAI API and return the result."""
+     client = OpenAI(api_key=api_key)
+     prompt = custom_prompt or DEFAULT_PROMPT
+     msg = [{
+         "role": "user",
+         "content": [
+             {"type": "text", "text": prompt},
+             {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{encode(img)}"}}
+         ]
+     }]
+     delay = 1
+     for attempt in range(5):
+         try:
+             response = await asyncio.to_thread(
+                 client.chat.completions.create,
+                 model=model,
+                 messages=msg,
+                 max_tokens=60,
+                 response_format={"type": "json_object"}
+             )
+             return json.loads(response.choices[0].message.content)
+         except Exception as e:
+             if "rate limit" in str(e).lower():
+                 await asyncio.sleep(delay)
+                 delay = min(delay * 2, 32)
+             else:
+                 return {"detected": "NO", "confidence": 0, "reason": f"Error: {str(e)[:50]}..."}
+     return {"detected": "NO", "confidence": 0, "reason": "Too many API retries, please try again later"}
+
+ # ─────────── RECURSIVE SEARCH FUNCTION (MULTI-CANDIDATE) ───────────
+ async def recurse_multi(img, region, depth, rows, cols, prog, api_key, model, max_candidates, all_results=None):
      """
+     Recursively analyze grid cells, allowing up to max_candidates per stage.
+     Returns a list of branch dictionaries with keys:
+     - "final_region": final region in the branch,
+     - "path": list of regions (from higher to lower levels),
+     - "stage_results": list of API results per stage.
      """
      if all_results is None:
          all_results = []

      if depth == 0:
+         return [{"final_region": region, "path": [], "stage_results": []}]

+     subs = split(region, rows, cols, ov=0.0, pad=0.0)
+     prog(0, desc=f"Stage {depth}: scanning {len(subs)} grid cells...")
+     tasks = []
+     for sub in subs:
+         crop_img = crop(img, sub)
+         tasks.append(ask_api(crop_img, api_key, model))
+     results = await asyncio.gather(*tasks)

+     # Pair each subregion with its result
+     sub_results = list(zip(subs, results))
+     # Sort by (rank, -confidence)
+     sub_results.sort(key=lambda tup: (rank(tup[1]["detected"]), -tup[1].get("confidence", 0)))

+     # Select candidates with a positive detection ("YES" or "MAYBE"); if none, take the best candidate
+     candidates = [tup for tup in sub_results if tup[1]["detected"] in ("YES", "MAYBE")]
+     if not candidates:
+         candidates = [sub_results[0]]
+     candidates = candidates[:max_candidates]

+     branches = []
+     for candidate_region, candidate_result in candidates:
+         # For the current candidate, record its stage result
+         current_stage = {"region": candidate_region, "result": candidate_result}
+         # Recursively search within the candidate region
+         child_branches = await recurse_multi(img, candidate_region, depth - 1, rows, cols, prog, api_key, model, max_candidates)
+         for branch in child_branches:
+             branch["path"].insert(0, candidate_region)
+             branch["stage_results"].insert(0, candidate_result)
+             branches.append(branch)
+     return branches

+ # ─────────── PIPELINE FUNCTION ───────────
+ def run_pipeline(pil_img, api_key, model, rows, cols, zoom, max_candidates, progress=gr.Progress()):
+     """
+     Process a single uploaded image:
+     1. Divide the image into grid cells.
+     2. Recursively zoom in by exploring up to max_candidates per stage.
+     3. Draw the search path on the original image.
+     4. Return the final cropped region (from the best branch), its search path, and a summary.
+     """
      error_message = None
      if pil_img is None:
          error_message = "Error: Please upload an image to analyze."
      elif not api_key or api_key.strip() == "":
+         error_message = "Error: OpenAI API key is required."
      elif not model or model.strip() == "":
          error_message = "Error: Please select an OpenAI model."
      if error_message:
+         return None, None, error_message
+
      try:
+         img_np = np.array(pil_img)
+     except Exception as e:
+         return None, None, f"Error converting image: {str(e)}"

+     full_region = (0, 0, 1, 1)
+     progress(0, desc=f"Starting recursive grid search using {model}...")
+     branches = asyncio.run(
+         recurse_multi(img_np, full_region, zoom, rows, cols, progress, api_key, model, max_candidates)
+     )
+
+     if not branches:
+         return None, None, "No branch found."
+
+     # Select the branch with the highest confidence at its first (top-level) stage
+     best_branch = max(branches, key=lambda b: b["stage_results"][0].get("confidence", 0))
+     final_reg = best_branch["final_region"] if "final_region" in best_branch else best_branch["path"][0]
+     final_crop = crop(img_np, final_reg)
+     final_crop_pil = Image.fromarray(final_crop)
+
+     # Draw the search path using the branch's path and stage_results (reversed, so the deepest region comes first)
+     path_order = list(reversed(best_branch["path"]))
+     stage_results_order = list(reversed(best_branch["stage_results"]))
+     path_img = draw_path(img_np, path_order, stage_results_order)
+     path_img_pil = Image.fromarray(path_img)
+
+     # Build summary text for the branch
+     summary_lines = []
+     for i, res in enumerate(stage_results_order):
+         summary_lines.append(f"Stage {i+1}: {res['detected']} ({res['confidence']:.2f}) - {res['reason']}")
+     summary_text = "\n".join(summary_lines)
+
+     return final_crop_pil, path_img_pil, summary_text

+ # ─────────── GRADIO UI ───────────
+ with gr.Blocks(title="Eagle‑Eyes Recursive Grid Search", css="footer {visibility: hidden}") as demo:
      gr.Markdown("""
+     # 🦅 Eagle‑Eyes Recursive Grid Search

+     Upload a single image. The tool will divide the image into grid cells and recursively zoom in on the most promising region.
+     At each stage, up to a configurable number of positive candidates are explored. The search path is drawn on the original image.
      """)

      with gr.Row():
          with gr.Column(scale=1):
              img_in = gr.Image(type="pil", label="Input Image")
              api_key = gr.Textbox(
                  label="OpenAI API Key",
                  placeholder="Enter your OpenAI API key here...",
                  type="password",
                  info="Your API key will be used only for this session and not stored"
              )
              model = gr.Dropdown(
                  choices=AVAILABLE_MODELS,
                  value=DEFAULT_MODEL,
                  label="Model Selection",
                  info="Select the OpenAI model to use for analysis"
              )
              with gr.Group():
                  with gr.Row():
+                     row = gr.Number(value=DEFAULTS["row"], label="Grid Rows", precision=0, minimum=1, maximum=10)
+                     col = gr.Number(value=DEFAULTS["col"], label="Grid Columns", precision=0, minimum=1, maximum=10)
+                     zoom = gr.Number(value=DEFAULTS["zoom"], label="Zoom Levels", precision=0, minimum=1, maximum=5)
                  with gr.Row():
+                     max_candidates = gr.Number(value=DEFAULTS["max_candidates"], label="Max Candidates per Stage", precision=0, minimum=1, maximum=10)
              btn = gr.Button("🔍 Run Search", variant="primary")
              summary_out = gr.Textbox(label="Results Summary", lines=5, interactive=False)
+         with gr.Column(scale=1):
+             crop_out = gr.Image(label="Final Crop (Zoomed Region)")
+             path_out = gr.Image(label="Search Path Visualization")
+
      gr.Markdown("""
      ### Tips for Best Results

+     - **OpenAI API Key**: Required for this tool. Your key remains private.
+     - **Model Selection**: Choose a model with vision support (e.g., `gpt-4o`, `gpt-4o-mini`, `o1`, etc.).
+     - **Grid Settings**: Adjust rows and columns to fine-tune segmentation.
+     - **Zoom Levels**: More zoom levels allow a deeper recursive search.
+     - **Max Candidates per Stage**: Controls how many positive grid cells to explore at each stage.
      """)
+
+     btn.click(
+         run_pipeline,
+         inputs=[img_in, api_key, model, row, col, zoom, max_candidates],
+         outputs=[crop_out, path_out, summary_out]
+     )

  if __name__ == "__main__":
+     demo.launch()
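For tracing the new control flow end to end, here is a minimal, self-contained sketch of the multi-candidate recursion with the API call stubbed out; `fake_detect`, the 2×2 grid, and the flagged top-left quadrant are illustrative assumptions, not anything in the commit:

```python
import asyncio

def split(r, rows, cols):
    # Simplified splitter matching the committed split() with ov=0, pad=0.
    x0, y0, w, h = r
    tw, th = w / cols, h / rows
    return [(x0 + cx * tw, y0 + ry * th, tw, th)
            for ry in range(rows) for cx in range(cols)]

def rank(det):
    # Same ordering as the commit: YES before MAYBE before NO.
    return {"YES": 0, "MAYBE": 1}.get(det, 2)

async def fake_detect(region):
    # Hypothetical stand-in for ask_api(): flags anything in the top-left quadrant.
    x, y, w, h = region
    hit = x < 0.5 and y < 0.5
    return {"detected": "YES" if hit else "NO", "confidence": 0.9 if hit else 0.1}

async def recurse_multi(region, depth, rows, cols, max_candidates):
    if depth == 0:
        return [{"final_region": region, "path": [], "stage_results": []}]
    subs = split(region, rows, cols)
    results = await asyncio.gather(*(fake_detect(s) for s in subs))
    pairs = sorted(zip(subs, results),
                   key=lambda t: (rank(t[1]["detected"]), -t[1]["confidence"]))
    cands = [p for p in pairs if p[1]["detected"] in ("YES", "MAYBE")] or [pairs[0]]
    branches = []
    for reg, res in cands[:max_candidates]:
        for b in await recurse_multi(reg, depth - 1, rows, cols, max_candidates):
            b["path"].insert(0, reg)            # outermost region ends up at index 0
            b["stage_results"].insert(0, res)
            branches.append(b)
    return branches

branches = asyncio.run(recurse_multi((0, 0, 1, 1), depth=2, rows=2, cols=2, max_candidates=2))
print(len(branches))                 # 2 branches survive the per-stage cap
print(branches[0]["final_region"])  # a quarter-of-a-quarter tile in the top-left
```

Because every stage keeps up to max_candidates promising cells instead of exactly one, the number of explored branches (and API calls) can grow toward max_candidates ** depth, which is worth keeping in mind when raising both values in the UI.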