Kalhar.Pandya commited on
Commit
81ce8e4
·
1 Parent(s): fad01b2

inaccurate

Browse files
Files changed (2) hide show
  1. README.md +1 -1
  2. app.py +154 -98
README.md CHANGED
@@ -5,7 +5,7 @@ colorFrom: blue
5
  colorTo: indigo
6
  sdk: gradio
7
  sdk_version: 3.50.0
8
- python_version: 3.10
9
  app_file: app.py
10
  pinned: false
11
  suggested_hardware: cpu-basic
 
5
  colorTo: indigo
6
  sdk: gradio
7
  sdk_version: 3.50.0
8
+ python_version: 3.12
9
  app_file: app.py
10
  pinned: false
11
  suggested_hardware: cpu-basic
app.py CHANGED
@@ -11,8 +11,19 @@ from openai import OpenAI
11
  load_dotenv()
12
  def _env(k, d=""): return os.getenv(k, d).split("#", 1)[0].strip()
13
 
14
- # API key will be provided through the UI instead of env variable
15
- MODEL = _env("OPENAI_MODEL", "gpt-4o")
 
 
 
 
 
 
 
 
 
 
 
16
 
17
  DEFAULTS = dict(
18
  row = int(_env("ROW_COUNT", 7)),
@@ -23,23 +34,32 @@ DEFAULTS = dict(
23
  )
24
 
25
  DEFAULT_PROMPT = (
26
- "You are a vision inspector. Look at the image and determine if a human is present. "
27
- "Partial visibility is acceptable — consider visual clues like visible limbs, clothing patterns, silhouettes, shadows, or partial faces. "
28
- "Take your time to inspect the scene and make an informed decision. Respond strictly with valid JSON in the following format: "
29
- '{"detected":"YES/NO/MAYBE","confidence":<float between 0 and 1>,"reason":"<15 words max>"}\n'
30
- "- YES: Clearly visible body or part (face, arms, posture, etc).\n"
31
- "- MAYBE: Suggestive shape or ambiguous signal (e.g., mannequin, shadow, blur).\n"
32
- "- NO: No visual evidence of a person."
 
 
 
 
 
 
33
  )
34
 
35
 
 
36
  # ─────────── HELPERS ───────────
37
  def encode(img):
38
  """Encode image to base64 string"""
39
- _, buf = cv2.imencode(".jpg", img)
 
 
40
  return base64.b64encode(buf).decode()
41
 
42
- async def ask_api(img, api_key, custom_prompt=None):
43
  """Ask OpenAI API about an image"""
44
  client = OpenAI(api_key=api_key)
45
  prompt = custom_prompt or DEFAULT_PROMPT
@@ -52,7 +72,7 @@ async def ask_api(img, api_key, custom_prompt=None):
52
  try:
53
  r = await asyncio.to_thread(
54
  client.chat.completions.create,
55
- model=MODEL, messages=msg, max_tokens=60,
56
  response_format={"type":"json_object"}
57
  )
58
  return json.loads(r.choices[0].message.content)
@@ -124,7 +144,7 @@ def draw_path(img, path, results=None):
124
 
125
  return out
126
 
127
- async def recurse(img, region, depth, rows, cols, ov, pad, prog, api_key, all_results=None):
128
  """
129
  Returns (final_region, path, results) where:
130
  - path is a list of chosen regions, one per stage
@@ -141,7 +161,7 @@ async def recurse(img, region, depth, rows, cols, ov, pad, prog, api_key, all_re
141
 
142
  async def task(i, r):
143
  crop_img = crop(img, r)
144
- result = await ask_api(crop_img, api_key)
145
  return i, result, r
146
 
147
  results = [None] * len(subs)
@@ -167,106 +187,126 @@ async def recurse(img, region, depth, rows, cols, ov, pad, prog, api_key, all_re
167
  all_results.append(stage_results)
168
 
169
  final_reg, sub_path, all_res = await recurse(
170
- img, best_region, depth - 1, rows, cols, ov, pad, prog, all_results
171
  )
 
172
 
173
  return final_reg, [best_region] + sub_path, all_res
174
 
175
  # ─────────── GRADIO PIPELINE ───────────
176
- def run_pipeline(pil_img, api_key, rows, cols, zoom, ov, pad, progress=gr.Progress()):
177
  """Main pipeline to process an image and find humans"""
 
 
 
 
 
 
178
  # Check for API key
179
- if not api_key or api_key.strip() == "":
180
- return (None, None, None, "Error: OpenAI API key is required to run the search.")
 
 
 
 
 
 
181
 
182
  # Input validation
183
- rows = max(1, min(int(rows), 10))
184
- cols = max(1, min(int(cols), 10))
185
- zoom = max(1, min(int(zoom), 3))
186
- ov = max(0, min(ov, 0.9))
187
- pad = max(0, min(pad, 0.3))
 
 
 
188
 
189
  with tempfile.TemporaryDirectory() as td:
190
- img_path = str(Path(td) / "in.jpg")
191
- pil_img.save(img_path)
192
-
193
- # Convert to RGB after reading (OpenCV reads as BGR)
194
- img = cv2.cvtColor(cv2.imread(img_path), cv2.COLOR_BGR2RGB)
 
195
 
196
- progress(0, desc="Starting recursive search...")
197
- final_reg, path, all_results = asyncio.run(
198
- recurse(img, (0, 0, 1, 1), zoom, rows, cols, ov, pad, progress, api_key)
199
- )
200
 
201
- # Get final crop (already in RGB)
202
- crop_img = crop(img, final_reg)
203
-
204
- # Collect results for each stage
205
- stage_results = []
206
- best_results = []
207
- for stage in all_results:
208
- best_idx = stage["best_idx"]
209
- results = stage["results"]
210
- if best_idx is not None and best_idx < len(results):
211
- best_results.append(results[best_idx])
212
-
213
- # Create the path visualization (already in RGB)
214
- path_img = draw_path(img, path, best_results)
215
-
216
- # Create visualization of all tiles in the first stage
217
- if all_results and len(all_results) > 0:
218
- first_stage = all_results[0]
219
- stage_img = img.copy()
220
 
221
- for i, r in enumerate(first_stage["results"]):
222
- region = first_stage["region"] if i == first_stage["best_idx"] else None
223
- x,y,w,h = split(region or (0,0,1,1), rows, cols, ov, pad)[i]
224
- H,W = img.shape[:2]
225
- x1,y1,x2,y2 = int(x*W), int(y*H), int((x+w)*W), int((y+h)*H)
226
-
227
- # Color based on detection
228
- if r["detected"] == "YES":
229
- color = (0, 255, 0) # Green
230
- elif r["detected"] == "MAYBE":
231
- color = (0, 165, 255) # Orange
232
- else:
233
- color = (255, 0, 0) # Red (in RGB)
234
-
235
- # Draw rectangle with confidence
236
- cv2.rectangle(stage_img, (x1,y1), (x2,y2), color, 1)
237
- conf = r.get("confidence", 0)
238
 
239
- # Add text with background
240
- label = f"{r['detected']} ({conf:.2f})"
241
- font = cv2.FONT_HERSHEY_SIMPLEX
242
- font_scale = 0.4
243
- thickness = 1
244
- text_size = cv2.getTextSize(label, font, font_scale, thickness)[0]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
245
 
246
- # Draw background for text
247
- cv2.rectangle(stage_img, (x1, y1-text_size[1]-5), (x1+text_size[0]+5, y1), (0,0,0), -1)
248
- cv2.putText(stage_img, label, (x1+2, y1-5), font, font_scale, (255,255,255), thickness)
 
 
 
 
 
 
 
249
 
250
- # Mark best tile with thicker border
251
- best_idx = first_stage["best_idx"]
252
- if best_idx is not None and best_idx < len(split((0,0,1,1), rows, cols, ov, pad)):
253
- r = split((0,0,1,1), rows, cols, ov, pad)[best_idx]
254
- x,y,w,h = r
255
- H,W = img.shape[:2]
256
- x1,y1,x2,y2 = int(x*W), int(y*H), int((x+w)*W), int((y+h)*H)
257
- cv2.rectangle(stage_img, (x1,y1), (x2,y2), (0,255,255), 3) # Yellow thick border
258
- else:
259
- stage_img = img.copy()
260
-
261
- # Create a summary of the results
262
- summary = []
263
- for i, res in enumerate(best_results):
264
- summary.append(f"Stage {i+1}: {res['detected']} ({res['confidence']:.2f}) - {res['reason']}")
265
-
266
- summary_text = "\n".join(summary)
267
-
268
- # Return results
269
- return crop_img, path_img, stage_img, summary_text
270
 
271
  # ─────────── UI ───────────
272
  with gr.Blocks(title="Eagle‑Eyes Search", css="footer {visibility: hidden}") as demo:
@@ -295,6 +335,14 @@ with gr.Blocks(title="Eagle‑Eyes Search", css="footer {visibility: hidden}") a
295
  info="Your API key will be used only for this session and not stored"
296
  )
297
 
 
 
 
 
 
 
 
 
298
  with gr.Group():
299
  with gr.Row():
300
  row = gr.Number(value=DEFAULTS["row"], label="Grid Rows", precision=0, minimum=1, maximum=10)
@@ -323,6 +371,14 @@ with gr.Blocks(title="Eagle‑Eyes Search", css="footer {visibility: hidden}") a
323
  ### Tips for Best Results
324
 
325
  - **OpenAI API Key**: Required to use this tool. Your key remains private and is not stored
 
 
 
 
 
 
 
 
326
  - **Grid Size**: More rows/columns give better precision but require more API calls
327
  - **Zoom Levels**: More levels allow deeper searching in complex images
328
  - **Overlap**: Higher overlap prevents missing objects at tile boundaries
@@ -332,7 +388,7 @@ with gr.Blocks(title="Eagle‑Eyes Search", css="footer {visibility: hidden}") a
332
  """)
333
 
334
  btn.click(run_pipeline,
335
- inputs=[img_in, api_key, row, col, zoom, ov, pad],
336
  outputs=[crop_out, path_out, stage_out, summary_out])
337
 
338
  if __name__ == "__main__":
 
11
  load_dotenv()
12
  def _env(k, d=""): return os.getenv(k, d).split("#", 1)[0].strip()
13
 
14
+ # API key and model will be provided through the UI
15
+ DEFAULT_MODEL = _env("OPENAI_MODEL", "gpt-4o")
16
+
17
+ # Available models for dropdown selection - all support vision capabilities
18
+ AVAILABLE_MODELS = [
19
+ "gpt-4o", # Current flagship model (most recommended)
20
+ "gpt-4o-mini", # More economical version of gpt-4o
21
+ "o1", # Advanced reasoning model with vision support
22
+ "o1-mini", # Smaller, faster version of o1
23
+ "o3-mini", # Newest reasoning model (Jan 2025)
24
+ "gpt-4-vision-preview", # Original vision model (being deprecated)
25
+ "gpt-4-turbo" # Older model with vision support
26
+ ]
27
 
28
  DEFAULTS = dict(
29
  row = int(_env("ROW_COUNT", 7)),
 
34
  )
35
 
36
  DEFAULT_PROMPT = (
37
+ "You are a highly detailed vision inspector specialized in human detection from aerial imagery. "
38
+ "You are provided with an image that may be divided into grid cells, each labeled with a unique number. "
39
+ "Your task is to examine the entire image (or each grid cell) and determine whether there is any sign of a human presence. "
40
+ "Partial visibility is acceptable—look for any visible human features such as limbs, faces, clothing, or distinct shadows and silhouettes that contrast with natural surroundings. "
41
+ "Consider unusual color patterns, shapes, or textures that might indicate a person, even if partially obscured by vegetation or terrain. "
42
+ "Take your time to analyze all clues carefully, and if there is any doubt, mention your top candidate grid cell(s). "
43
+ "Respond strictly with valid JSON in the following format:\n"
44
+ " {\"detected\":\"YES/NO/MAYBE\", \"confidence\":<float between 0 and 1>, \"reason\":\"<15 words max>\"}\n"
45
+ "For example, if a grid cell shows a clear human silhouette with contrasting clothing, your response might be:\n"
46
+ " {\"detected\":\"YES\", \"confidence\":0.87, \"reason\":\"Clear human figure in grid cell 23 with distinct clothing and shadow.\"}\n"
47
+ "- YES: A human or clear human-like feature is observed.\n"
48
+ "- MAYBE: Ambiguous or partial human evidence is present.\n"
49
+ "- NO: No evidence of human presence is detected."
50
  )
51
 
52
 
53
+
54
  # ─────────── HELPERS ───────────
55
  def encode(img):
56
  """Encode image to base64 string"""
57
+ # Set JPEG quality to higher value for better image quality
58
+ encode_params = [int(cv2.IMWRITE_JPEG_QUALITY), 95]
59
+ _, buf = cv2.imencode(".jpg", img, encode_params)
60
  return base64.b64encode(buf).decode()
61
 
62
+ async def ask_api(img, api_key, model="gpt-4o", custom_prompt=None):
63
  """Ask OpenAI API about an image"""
64
  client = OpenAI(api_key=api_key)
65
  prompt = custom_prompt or DEFAULT_PROMPT
 
72
  try:
73
  r = await asyncio.to_thread(
74
  client.chat.completions.create,
75
+ model=model, messages=msg, max_tokens=60,
76
  response_format={"type":"json_object"}
77
  )
78
  return json.loads(r.choices[0].message.content)
 
144
 
145
  return out
146
 
147
+ async def recurse(img, region, depth, rows, cols, ov, pad, prog, api_key, model, all_results=None):
148
  """
149
  Returns (final_region, path, results) where:
150
  - path is a list of chosen regions, one per stage
 
161
 
162
  async def task(i, r):
163
  crop_img = crop(img, r)
164
+ result = await ask_api(crop_img, api_key, model)
165
  return i, result, r
166
 
167
  results = [None] * len(subs)
 
187
  all_results.append(stage_results)
188
 
189
  final_reg, sub_path, all_res = await recurse(
190
+ img, best_region, depth - 1, rows, cols, ov, pad, prog, api_key, model, all_results
191
  )
192
+
193
 
194
  return final_reg, [best_region] + sub_path, all_res
195
 
196
  # ─────────── GRADIO PIPELINE ───────────
197
+ def run_pipeline(pil_img, api_key, model, rows, cols, zoom, ov, pad, progress=gr.Progress()):
198
  """Main pipeline to process an image and find humans"""
199
+ # Input validation and error checking
200
+ error_message = None
201
+
202
+ # Check if image was provided
203
+ if pil_img is None:
204
+ error_message = "Error: Please upload an image to analyze."
205
  # Check for API key
206
+ elif not api_key or api_key.strip() == "":
207
+ error_message = "Error: OpenAI API key is required to run the search."
208
+ # Check if model is selected
209
+ elif not model or model.strip() == "":
210
+ error_message = "Error: Please select an OpenAI model."
211
+
212
+ if error_message:
213
+ return (None, None, None, error_message)
214
 
215
  # Input validation
216
+ try:
217
+ rows = max(1, min(int(rows), 10))
218
+ cols = max(1, min(int(cols), 10))
219
+ zoom = max(1, min(int(zoom), 3))
220
+ ov = max(0, min(float(ov), 0.9))
221
+ pad = max(0, min(float(pad), 0.3))
222
+ except (ValueError, TypeError):
223
+ return (None, None, None, "Error: Invalid parameter values. Using defaults instead.")
224
 
225
  with tempfile.TemporaryDirectory() as td:
226
+ try:
227
+ img_path = str(Path(td) / "in.jpg")
228
+ pil_img.save(img_path)
229
+
230
+ # Convert to RGB after reading (OpenCV reads as BGR)
231
+ img = cv2.cvtColor(cv2.imread(img_path), cv2.COLOR_BGR2RGB)
232
 
233
+ progress(0, desc=f"Starting recursive search using {model}...")
234
+ final_reg, path, all_results = asyncio.run(
235
+ recurse(img, (0, 0, 1, 1), zoom, rows, cols, ov, pad, progress, api_key, model)
236
+ )
237
 
238
+ # Get final crop (already in RGB)
239
+ crop_img = crop(img, final_reg)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
240
 
241
+ # Collect results for each stage
242
+ stage_results = []
243
+ best_results = []
244
+ for stage in all_results:
245
+ best_idx = stage["best_idx"]
246
+ results = stage["results"]
247
+ if best_idx is not None and best_idx < len(results):
248
+ best_results.append(results[best_idx])
249
+
250
+ # Create the path visualization (already in RGB)
251
+ path_img = draw_path(img, path, best_results)
252
+
253
+ # Create visualization of all tiles in the first stage
254
+ if all_results and len(all_results) > 0:
255
+ first_stage = all_results[0]
256
+ stage_img = img.copy()
 
257
 
258
+ for i, r in enumerate(first_stage["results"]):
259
+ region = first_stage["region"] if i == first_stage["best_idx"] else None
260
+ x,y,w,h = split(region or (0,0,1,1), rows, cols, ov, pad)[i]
261
+ H,W = img.shape[:2]
262
+ x1,y1,x2,y2 = int(x*W), int(y*H), int((x+w)*W), int((y+h)*H)
263
+
264
+ # Color based on detection
265
+ if r["detected"] == "YES":
266
+ color = (0, 255, 0) # Green
267
+ elif r["detected"] == "MAYBE":
268
+ color = (0, 165, 255) # Orange
269
+ else:
270
+ color = (255, 0, 0) # Red (in RGB)
271
+
272
+ # Draw rectangle with confidence
273
+ cv2.rectangle(stage_img, (x1,y1), (x2,y2), color, 1)
274
+ conf = r.get("confidence", 0)
275
+
276
+ # Add text with background
277
+ label = f"{r['detected']} ({conf:.2f})"
278
+ font = cv2.FONT_HERSHEY_SIMPLEX
279
+ font_scale = 0.4
280
+ thickness = 1
281
+ text_size = cv2.getTextSize(label, font, font_scale, thickness)[0]
282
+
283
+ # Draw background for text
284
+ cv2.rectangle(stage_img, (x1, y1-text_size[1]-5), (x1+text_size[0]+5, y1), (0,0,0), -1)
285
+ cv2.putText(stage_img, label, (x1+2, y1-5), font, font_scale, (255,255,255), thickness)
286
 
287
+ # Mark best tile with thicker border
288
+ best_idx = first_stage["best_idx"]
289
+ if best_idx is not None and best_idx < len(split((0,0,1,1), rows, cols, ov, pad)):
290
+ r = split((0,0,1,1), rows, cols, ov, pad)[best_idx]
291
+ x,y,w,h = r
292
+ H,W = img.shape[:2]
293
+ x1,y1,x2,y2 = int(x*W), int(y*H), int((x+w)*W), int((y+h)*H)
294
+ cv2.rectangle(stage_img, (x1,y1), (x2,y2), (0,255,255), 3) # Yellow thick border
295
+ else:
296
+ stage_img = img.copy()
297
 
298
+ # Create a summary of the results
299
+ summary = []
300
+ for i, res in enumerate(best_results):
301
+ summary.append(f"Stage {i+1}: {res['detected']} ({res['confidence']:.2f}) - {res['reason']}")
302
+
303
+ summary_text = "\n".join(summary)
304
+
305
+ # Return results
306
+ return crop_img, path_img, stage_img, summary_text
307
+ except Exception as e:
308
+ # Handle any other exceptions
309
+ return (None, None, None, f"Error: {str(e)}")
 
 
 
 
 
 
 
 
310
 
311
  # ─────────── UI ───────────
312
  with gr.Blocks(title="Eagle‑Eyes Search", css="footer {visibility: hidden}") as demo:
 
335
  info="Your API key will be used only for this session and not stored"
336
  )
337
 
338
+ # Model selection dropdown
339
+ model = gr.Dropdown(
340
+ choices=AVAILABLE_MODELS,
341
+ value=DEFAULT_MODEL,
342
+ label="Model Selection",
343
+ info="Select the OpenAI model to use for analysis"
344
+ )
345
+
346
  with gr.Group():
347
  with gr.Row():
348
  row = gr.Number(value=DEFAULTS["row"], label="Grid Rows", precision=0, minimum=1, maximum=10)
 
371
  ### Tips for Best Results
372
 
373
  - **OpenAI API Key**: Required to use this tool. Your key remains private and is not stored
374
+ - **Model Selection**: Choose the appropriate OpenAI model:
375
+ - `gpt-4o`: Best overall performance for general vision tasks (recommended)
376
+ - `gpt-4o-mini`: More economical version of gpt-4o with good performance
377
+ - `o1`: Advanced reasoning model, excellent for complex analysis
378
+ - `o1-mini`: Smaller, faster version of o1
379
+ - `o3-mini`: Newest reasoning model (Jan 2025), optimized for STEM tasks
380
+ - `gpt-4-vision-preview`: Original vision model (being deprecated)
381
+ - `gpt-4-turbo`: Older model with vision capabilities
382
  - **Grid Size**: More rows/columns give better precision but require more API calls
383
  - **Zoom Levels**: More levels allow deeper searching in complex images
384
  - **Overlap**: Higher overlap prevents missing objects at tile boundaries
 
388
  """)
389
 
390
  btn.click(run_pipeline,
391
+ inputs=[img_in, api_key, model, row, col, zoom, ov, pad],
392
  outputs=[crop_out, path_out, stage_out, summary_out])
393
 
394
  if __name__ == "__main__":