# EagleEye / app.py
# Author: Kalhar.Pandya
# Commit: final (05bac69)
import asyncio, base64, json, os, tempfile
from pathlib import Path
import cv2
import gradio as gr
import numpy as np
from PIL import Image
from dotenv import load_dotenv
from openai import OpenAI
# ─────────── ENV + DEFAULTS ───────────
load_dotenv()
def _env(k, d=""):
return os.getenv(k, d).split("#", 1)[0].strip()
DEFAULT_MODEL = _env("OPENAI_MODEL", "gpt-4o")
AVAILABLE_MODELS = [
"gpt-4o", # Current flagship model with vision support
"gpt-4o-mini", # More economical version of gpt-4o with vision support
"o1", # Advanced reasoning model with vision support
"o1-mini", # Smaller, faster version (if needed)
"o3-mini", # Newest reasoning model (Jan 2025)
"gpt-4-vision-preview", # Original vision model (being deprecated)
"gpt-4-turbo" # Older model with vision support
]
DEFAULTS = dict(
row = int(_env("ROW_COUNT", 7)),
col = int(_env("COL_COUNT", 7)),
zoom = int(_env("ZOOM_LEVELS", 2)), # Recursion depth (zoom levels)
overlap = 0.0, # Fixed at 0 as requested
pad = 0.0, # Fixed at 0 as requested
max_candidates = int(_env("MAX_CANDIDATES", 3)) # Maximum number of candidates per search
)
# ─────────── PROMPT FOR GRID CELL ANALYSIS ───────────
DEFAULT_PROMPT = (
"You are a vision inspector. Look at the image and determine if a human is present. "
"Partial visibility is acceptableβ€”consider clues like limbs, clothing, silhouettes, shadows, or partial faces. "
"Respond strictly with valid JSON in the following format:\n"
'{"detected":"YES/NO/MAYBE", "confidence":<float between 0 and 1>, "reason":"<15 words max>"}\n'
"- YES: Clearly visible human feature(s) are observed.\n"
"- MAYBE: Ambiguous or partial evidence is present.\n"
"- NO: No evidence of a human is detected."
)
# ─────────── HELPERS ───────────
def encode(img):
"""Encode image to a Base64 string."""
encode_params = [int(cv2.IMWRITE_JPEG_QUALITY), 95]
_, buf = cv2.imencode(".jpg", img, encode_params)
return base64.b64encode(buf).decode()
def crop(img, r):
"""Crop image to region r=(x,y,w,h) in relative coordinates."""
H, W = img.shape[:2]
x, y, w, h = r
return img[int(y * H):int((y + h) * H), int(x * W):int((x + w) * W)]
def split(r, rows, cols, ov=0.0, pad=0.0):
"""
Split region r=(x,y,w,h) into a grid of subregions with specified rows and columns.
Overlap and padding are fixed at 0 as configured.
"""
x0, y0, w, h = r
tw, th = w / cols, h / rows
sx, sy = tw, th # no overlap since ov=0.0
tiles = []
# Calculate number of grid cells
nx = max(1, int((w - tw) // sx) + 1)
ny = max(1, int((h - th) // sy) + 1)
for ry in range(ny):
for cx in range(nx):
sx0 = min(x0 + cx * sx, x0 + w - tw)
sy0 = min(y0 + ry * sy, y0 + h - th)
tiles.append((sx0, sy0, tw, th))
return tiles
def rank(det):
"""Rank the detection result."""
return {"YES": 0, "MAYBE": 1}.get(det, 2)
def draw_path(img, path, results=None):
"""Draw the search path on the image with rectangles for each stage."""
out = img.copy()
STAGE_COLOURS = [(0, 165, 255), (0, 255, 0), (255, 0, 0), (255, 255, 0), (128, 0, 128)]
for i, r in enumerate(path):
x, y, w, h = r
H, W = img.shape[:2]
x1, y1 = int(x * W), int(y * H)
x2, y2 = int((x + w) * W), int((y + h) * H)
color = STAGE_COLOURS[i % len(STAGE_COLOURS)]
cv2.rectangle(out, (x1, y1), (x2, y2), color, 2)
label = f"S{i+1}"
if results and i < len(results):
res = results[i]
if "detected" in res:
det = res["detected"]
conf = res.get("confidence", 0)
label += f": {det} ({conf:.2f})"
font = cv2.FONT_HERSHEY_SIMPLEX
font_scale = 0.5
thickness = 1
text_size = cv2.getTextSize(label, font, font_scale, thickness)[0]
cv2.rectangle(out, (x1, y1 - text_size[1] - 5), (x1 + text_size[0] + 5, y1), color, -1)
cv2.putText(out, label, (x1 + 2, y1 - 5), font, font_scale, (255, 255, 255), thickness)
return out
# ─────────── API CALL FOR A SINGLE GRID CELL ───────────
async def ask_api(img, api_key, model="gpt-4o", custom_prompt=None):
"""Send one grid cell image to the OpenAI API and return the result."""
client = OpenAI(api_key=api_key)
prompt = custom_prompt or DEFAULT_PROMPT
msg = [{
"role": "user",
"content": [
{"type": "text", "text": prompt},
{"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{encode(img)}"}}
]
}]
delay = 1
for attempt in range(5):
try:
response = await asyncio.to_thread(
client.chat.completions.create,
model=model,
messages=msg,
max_tokens=60,
response_format={"type": "json_object"}
)
return json.loads(response.choices[0].message.content)
except Exception as e:
if "rate limit" in str(e).lower():
await asyncio.sleep(delay)
delay = min(delay * 2, 32)
else:
return {"detected": "NO", "confidence": 0, "reason": f"Error: {str(e)[:50]}..."}
return {"detected": "NO", "confidence": 0, "reason": "Too many API retries, please try again later"}
# ─────────── RECURSIVE SEARCH FUNCTION (MULTI-CANDIDATE) ───────────
async def recurse_multi(img, region, depth, rows, cols, prog, api_key, model, max_candidates, all_results=None):
"""
Recursively analyze grid cells, allowing up to max_candidates per stage.
Returns a list of branch dictionaries with keys:
- "final_region": final region in the branch,
- "path": list of regions (from higher to lower levels),
- "stage_results": list of API results per stage.
"""
if all_results is None:
all_results = []
if depth == 0:
return [{"final_region": region, "path": [], "stage_results": []}]
subs = split(region, rows, cols, ov=0.0, pad=0.0)
prog(0, desc=f"Stage {depth}: scanning {len(subs)} grid cells...")
tasks = []
for sub in subs:
crop_img = crop(img, sub)
tasks.append(ask_api(crop_img, api_key, model))
results = await asyncio.gather(*tasks)
# Pair each subregion with its result
sub_results = list(zip(subs, results))
# Sort by (rank, -confidence)
sub_results.sort(key=lambda tup: (rank(tup[1]["detected"]), -tup[1].get("confidence", 0)))
# Select candidates with positive detection ("YES" or "MAYBE"); if none, take best candidate
candidates = [tup for tup in sub_results if tup[1]["detected"] in ("YES", "MAYBE")]
if not candidates:
candidates = [sub_results[0]]
candidates = candidates[:max_candidates]
branches = []
for candidate_region, candidate_result in candidates:
# For current candidate, record its stage result
current_stage = {"region": candidate_region, "result": candidate_result}
# Recursively search within candidate region
child_branches = await recurse_multi(img, candidate_region, depth - 1, rows, cols, prog, api_key, model, max_candidates)
for branch in child_branches:
branch["path"].insert(0, candidate_region)
branch["stage_results"].insert(0, candidate_result)
branches.append(branch)
return branches
# ─────────── PIPELINE FUNCTION ───────────
def run_pipeline(pil_img, api_key, model, rows, cols, zoom, max_candidates, progress=gr.Progress()):
"""
Process a single uploaded image:
1. Divide the image into grid cells.
2. Recursively zoom in by exploring up to max_candidates per stage.
3. Draw the search path on the original image.
4. Return the final cropped region (from the best branch), its search path, and a summary.
"""
error_message = None
if pil_img is None:
error_message = "Error: Please upload an image to analyze."
elif not api_key or api_key.strip() == "":
error_message = "Error: OpenAI API key is required."
elif not model or model.strip() == "":
error_message = "Error: Please select an OpenAI model."
if error_message:
return None, None, error_message
try:
img_np = np.array(pil_img)
except Exception as e:
return None, None, f"Error converting image: {str(e)}"
full_region = (0, 0, 1, 1)
progress(0, desc=f"Starting recursive grid search using {model}...")
branches = asyncio.run(
recurse_multi(img_np, full_region, zoom, rows, cols, progress, api_key, model, max_candidates)
)
if not branches:
return None, None, "No branch found."
# Select the branch with the highest confidence in its most zoomed-in stage
best_branch = max(branches, key=lambda b: b["stage_results"][0].get("confidence", 0))
final_reg = best_branch["final_region"] if "final_region" in best_branch else best_branch["path"][0]
final_crop = crop(img_np, final_reg)
final_crop_pil = Image.fromarray(final_crop)
# Draw search path using the branch's path and stage_results (reverse to show top-level first)
path_order = list(reversed(best_branch["path"]))
stage_results_order = list(reversed(best_branch["stage_results"]))
path_img = draw_path(img_np, path_order, stage_results_order)
path_img_pil = Image.fromarray(path_img)
# Build summary text for the branch
summary_lines = []
for i, res in enumerate(stage_results_order):
summary_lines.append(f"Stage {i+1}: {res['detected']} ({res['confidence']:.2f}) - {res['reason']}")
summary_text = "\n".join(summary_lines)
return final_crop_pil, path_img_pil, summary_text
# ─────────── GRADIO UI ───────────
with gr.Blocks(title="Eagle‑Eyes Recursive Grid Search", css="footer {visibility: hidden}") as demo:
gr.Markdown("""
# πŸ¦… Eagle‑Eyes Recursive Grid Search
Upload a single image. The tool will divide the image into grid cells and recursively zoom in on the most promising region.
At each stage, up to a configurable number of positive candidates are explored. The search path is drawn on the original image.
""")
with gr.Row():
with gr.Column(scale=1):
img_in = gr.Image(type="pil", label="Input Image")
api_key = gr.Textbox(
label="OpenAI API Key",
placeholder="Enter your OpenAI API key here...",
type="password",
info="Your API key will be used only for this session and not stored"
)
model = gr.Dropdown(
choices=AVAILABLE_MODELS,
value=DEFAULT_MODEL,
label="Model Selection",
info="Select the OpenAI model to use for analysis"
)
with gr.Group():
with gr.Row():
row = gr.Number(value=DEFAULTS["row"], label="Grid Rows", precision=0, minimum=1, maximum=10)
col = gr.Number(value=DEFAULTS["col"], label="Grid Columns", precision=0, minimum=1, maximum=10)
zoom = gr.Number(value=DEFAULTS["zoom"], label="Zoom Levels", precision=0, minimum=1, maximum=5)
with gr.Row():
max_candidates = gr.Number(value=DEFAULTS["max_candidates"], label="Max Candidates per Stage", precision=0, minimum=1, maximum=10)
btn = gr.Button("πŸ” Run Search", variant="primary")
summary_out = gr.Textbox(label="Results Summary", lines=5, interactive=False)
with gr.Column(scale=1):
crop_out = gr.Image(label="Final Crop (Zoomed Region)")
path_out = gr.Image(label="Search Path Visualization")
gr.Markdown("""
### Tips for Best Results
- **OpenAI API Key**: Required for this tool. Your key remains private.
- **Model Selection**: Choose a model with vision support (e.g., `gpt-4o`, `gpt-4o-mini`, `o1`, etc.).
- **Grid Settings**: Adjust rows and columns to fine-tune segmentation.
- **Zoom Levels**: More zoom levels perform deeper recursive search.
- **Max Candidates per Stage**: Controls how many positive grid cells to explore at each stage.
""")
btn.click(
run_pipeline,
inputs=[img_in, api_key, model, row, col, zoom, max_candidates],
outputs=[crop_out, path_out, summary_out]
)
if __name__ == "__main__":
demo.launch()