Kalhar.Pandya commited on
Commit
81ce8e4
·
1 Parent(s): fad01b2

inaccurate

Browse files
Files changed (2) hide show
  1. README.md +1 -1
  2. app.py +154 -98
README.md CHANGED
@@ -5,7 +5,7 @@ colorFrom: blue
5
  colorTo: indigo
6
  sdk: gradio
7
  sdk_version: 3.50.0
8
- python_version: 3.10
9
  app_file: app.py
10
  pinned: false
11
  suggested_hardware: cpu-basic
 
5
  colorTo: indigo
6
  sdk: gradio
7
  sdk_version: 3.50.0
8
+ python_version: 3.12
9
  app_file: app.py
10
  pinned: false
11
  suggested_hardware: cpu-basic
app.py CHANGED
@@ -11,8 +11,19 @@ from openai import OpenAI
11
  load_dotenv()
12
  def _env(k, d=""): return os.getenv(k, d).split("#", 1)[0].strip()
13
 
14
- # API key will be provided through the UI instead of env variable
15
- MODEL = _env("OPENAI_MODEL", "gpt-4o")
 
 
 
 
 
 
 
 
 
 
 
16
 
17
  DEFAULTS = dict(
18
  row = int(_env("ROW_COUNT", 7)),
@@ -23,23 +34,32 @@ DEFAULTS = dict(
23
  )
24
 
25
  DEFAULT_PROMPT = (
26
- "You are a vision inspector. Look at the image and determine if a human is present. "
27
- "Partial visibility is acceptable — consider visual clues like visible limbs, clothing patterns, silhouettes, shadows, or partial faces. "
28
- "Take your time to inspect the scene and make an informed decision. Respond strictly with valid JSON in the following format: "
29
- '{"detected":"YES/NO/MAYBE","confidence":<float between 0 and 1>,"reason":"<15 words max>"}\n'
30
- "- YES: Clearly visible body or part (face, arms, posture, etc).\n"
31
- "- MAYBE: Suggestive shape or ambiguous signal (e.g., mannequin, shadow, blur).\n"
32
- "- NO: No visual evidence of a person."
 
 
 
 
 
 
33
  )
34
 
35
 
 
36
  # ─────────── HELPERS ───────────
37
  def encode(img):
38
  """Encode image to base64 string"""
39
- _, buf = cv2.imencode(".jpg", img)
 
 
40
  return base64.b64encode(buf).decode()
41
 
42
- async def ask_api(img, api_key, custom_prompt=None):
43
  """Ask OpenAI API about an image"""
44
  client = OpenAI(api_key=api_key)
45
  prompt = custom_prompt or DEFAULT_PROMPT
@@ -52,7 +72,7 @@ async def ask_api(img, api_key, custom_prompt=None):
52
  try:
53
  r = await asyncio.to_thread(
54
  client.chat.completions.create,
55
- model=MODEL, messages=msg, max_tokens=60,
56
  response_format={"type":"json_object"}
57
  )
58
  return json.loads(r.choices[0].message.content)
@@ -124,7 +144,7 @@ def draw_path(img, path, results=None):
124
 
125
  return out
126
 
127
- async def recurse(img, region, depth, rows, cols, ov, pad, prog, api_key, all_results=None):
128
  """
129
  Returns (final_region, path, results) where:
130
  - path is a list of chosen regions, one per stage
@@ -141,7 +161,7 @@ async def recurse(img, region, depth, rows, cols, ov, pad, prog, api_key, all_re
141
 
142
  async def task(i, r):
143
  crop_img = crop(img, r)
144
- result = await ask_api(crop_img, api_key)
145
  return i, result, r
146
 
147
  results = [None] * len(subs)
@@ -167,106 +187,126 @@ async def recurse(img, region, depth, rows, cols, ov, pad, prog, api_key, all_re
167
  all_results.append(stage_results)
168
 
169
  final_reg, sub_path, all_res = await recurse(
170
- img, best_region, depth - 1, rows, cols, ov, pad, prog, all_results
171
  )
 
172
 
173
  return final_reg, [best_region] + sub_path, all_res
174
 
175
  # ─────────── GRADIO PIPELINE ───────────
176
- def run_pipeline(pil_img, api_key, rows, cols, zoom, ov, pad, progress=gr.Progress()):
177
  """Main pipeline to process an image and find humans"""
 
 
 
 
 
 
178
  # Check for API key
179
- if not api_key or api_key.strip() == "":
180
- return (None, None, None, "Error: OpenAI API key is required to run the search.")
 
 
 
 
 
 
181
 
182
  # Input validation
183
- rows = max(1, min(int(rows), 10))
184
- cols = max(1, min(int(cols), 10))
185
- zoom = max(1, min(int(zoom), 3))
186
- ov = max(0, min(ov, 0.9))
187
- pad = max(0, min(pad, 0.3))
 
 
 
188
 
189
  with tempfile.TemporaryDirectory() as td:
190
- img_path = str(Path(td) / "in.jpg")
191
- pil_img.save(img_path)
192
-
193
- # Convert to RGB after reading (OpenCV reads as BGR)
194
- img = cv2.cvtColor(cv2.imread(img_path), cv2.COLOR_BGR2RGB)
 
195
 
196
- progress(0, desc="Starting recursive search...")
197
- final_reg, path, all_results = asyncio.run(
198
- recurse(img, (0, 0, 1, 1), zoom, rows, cols, ov, pad, progress, api_key)
199
- )
200
 
201
- # Get final crop (already in RGB)
202
- crop_img = crop(img, final_reg)
203
-
204
- # Collect results for each stage
205
- stage_results = []
206
- best_results = []
207
- for stage in all_results:
208
- best_idx = stage["best_idx"]
209
- results = stage["results"]
210
- if best_idx is not None and best_idx < len(results):
211
- best_results.append(results[best_idx])
212
-
213
- # Create the path visualization (already in RGB)
214
- path_img = draw_path(img, path, best_results)
215
-
216
- # Create visualization of all tiles in the first stage
217
- if all_results and len(all_results) > 0:
218
- first_stage = all_results[0]
219
- stage_img = img.copy()
220
 
221
- for i, r in enumerate(first_stage["results"]):
222
- region = first_stage["region"] if i == first_stage["best_idx"] else None
223
- x,y,w,h = split(region or (0,0,1,1), rows, cols, ov, pad)[i]
224
- H,W = img.shape[:2]
225
- x1,y1,x2,y2 = int(x*W), int(y*H), int((x+w)*W), int((y+h)*H)
226
-
227
- # Color based on detection
228
- if r["detected"] == "YES":
229
- color = (0, 255, 0) # Green
230
- elif r["detected"] == "MAYBE":
231
- color = (0, 165, 255) # Orange
232
- else:
233
- color = (255, 0, 0) # Red (in RGB)
234
-
235
- # Draw rectangle with confidence
236
- cv2.rectangle(stage_img, (x1,y1), (x2,y2), color, 1)
237
- conf = r.get("confidence", 0)
238
 
239
- # Add text with background
240
- label = f"{r['detected']} ({conf:.2f})"
241
- font = cv2.FONT_HERSHEY_SIMPLEX
242
- font_scale = 0.4
243
- thickness = 1
244
- text_size = cv2.getTextSize(label, font, font_scale, thickness)[0]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
245
 
246
- # Draw background for text
247
- cv2.rectangle(stage_img, (x1, y1-text_size[1]-5), (x1+text_size[0]+5, y1), (0,0,0), -1)
248
- cv2.putText(stage_img, label, (x1+2, y1-5), font, font_scale, (255,255,255), thickness)
 
 
 
 
 
 
 
249
 
250
- # Mark best tile with thicker border
251
- best_idx = first_stage["best_idx"]
252
- if best_idx is not None and best_idx < len(split((0,0,1,1), rows, cols, ov, pad)):
253
- r = split((0,0,1,1), rows, cols, ov, pad)[best_idx]
254
- x,y,w,h = r
255
- H,W = img.shape[:2]
256
- x1,y1,x2,y2 = int(x*W), int(y*H), int((x+w)*W), int((y+h)*H)
257
- cv2.rectangle(stage_img, (x1,y1), (x2,y2), (0,255,255), 3) # Yellow thick border
258
- else:
259
- stage_img = img.copy()
260
-
261
- # Create a summary of the results
262
- summary = []
263
- for i, res in enumerate(best_results):
264
- summary.append(f"Stage {i+1}: {res['detected']} ({res['confidence']:.2f}) - {res['reason']}")
265
-
266
- summary_text = "\n".join(summary)
267
-
268
- # Return results
269
- return crop_img, path_img, stage_img, summary_text
270
 
271
  # ─────────── UI ───────────
272
  with gr.Blocks(title="Eagle‑Eyes Search", css="footer {visibility: hidden}") as demo:
@@ -295,6 +335,14 @@ with gr.Blocks(title="Eagle‑Eyes Search", css="footer {visibility: hidden}") a
295
  info="Your API key will be used only for this session and not stored"
296
  )
297
 
 
 
 
 
 
 
 
 
298
  with gr.Group():
299
  with gr.Row():
300
  row = gr.Number(value=DEFAULTS["row"], label="Grid Rows", precision=0, minimum=1, maximum=10)
@@ -323,6 +371,14 @@ with gr.Blocks(title="Eagle‑Eyes Search", css="footer {visibility: hidden}") a
323
  ### Tips for Best Results
324
 
325
  - **OpenAI API Key**: Required to use this tool. Your key remains private and is not stored
 
 
 
 
 
 
 
 
326
  - **Grid Size**: More rows/columns give better precision but require more API calls
327
  - **Zoom Levels**: More levels allow deeper searching in complex images
328
  - **Overlap**: Higher overlap prevents missing objects at tile boundaries
@@ -332,7 +388,7 @@ with gr.Blocks(title="Eagle‑Eyes Search", css="footer {visibility: hidden}") a
332
  """)
333
 
334
  btn.click(run_pipeline,
335
- inputs=[img_in, api_key, row, col, zoom, ov, pad],
336
  outputs=[crop_out, path_out, stage_out, summary_out])
337
 
338
  if __name__ == "__main__":
 
11
  load_dotenv()
12
  def _env(k, d=""): return os.getenv(k, d).split("#", 1)[0].strip()
13
 
14
+ # API key and model will be provided through the UI
15
+ DEFAULT_MODEL = _env("OPENAI_MODEL", "gpt-4o")
16
+
17
+ # Available models for dropdown selection - all support vision capabilities
18
+ AVAILABLE_MODELS = [
19
+ "gpt-4o", # Current flagship model (most recommended)
20
+ "gpt-4o-mini", # More economical version of gpt-4o
21
+ "o1", # Advanced reasoning model with vision support
22
+ "o1-mini", # Smaller, faster version of o1
23
+ "o3-mini", # Newest reasoning model (Jan 2025)
24
+ "gpt-4-vision-preview", # Original vision model (being deprecated)
25
+ "gpt-4-turbo" # Older model with vision support
26
+ ]
27
 
28
  DEFAULTS = dict(
29
  row = int(_env("ROW_COUNT", 7)),
 
34
  )
35
 
36
  DEFAULT_PROMPT = (
37
+ "You are a highly detailed vision inspector specialized in human detection from aerial imagery. "
38
+ "You are provided with an image that may be divided into grid cells, each labeled with a unique number. "
39
+ "Your task is to examine the entire image (or each grid cell) and determine whether there is any sign of a human presence. "
40
+ "Partial visibility is acceptable—look for any visible human features such as limbs, faces, clothing, or distinct shadows and silhouettes that contrast with natural surroundings. "
41
+ "Consider unusual color patterns, shapes, or textures that might indicate a person, even if partially obscured by vegetation or terrain. "
42
+ "Take your time to analyze all clues carefully, and if there is any doubt, mention your top candidate grid cell(s). "
43
+ "Respond strictly with valid JSON in the following format:\n"
44
+ " {\"detected\":\"YES/NO/MAYBE\", \"confidence\":<float between 0 and 1>, \"reason\":\"<15 words max>\"}\n"
45
+ "For example, if a grid cell shows a clear human silhouette with contrasting clothing, your response might be:\n"
46
+ " {\"detected\":\"YES\", \"confidence\":0.87, \"reason\":\"Clear human figure in grid cell 23 with distinct clothing and shadow.\"}\n"
47
+ "- YES: A human or clear human-like feature is observed.\n"
48
+ "- MAYBE: Ambiguous or partial human evidence is present.\n"
49
+ "- NO: No evidence of human presence is detected."
50
  )
51
 
52
 
53
+
54
  # ─────────── HELPERS ───────────
55
  def encode(img):
56
  """Encode image to base64 string"""
57
+ # Set JPEG quality to higher value for better image quality
58
+ encode_params = [int(cv2.IMWRITE_JPEG_QUALITY), 95]
59
+ _, buf = cv2.imencode(".jpg", img, encode_params)
60
  return base64.b64encode(buf).decode()
61
 
62
+ async def ask_api(img, api_key, model="gpt-4o", custom_prompt=None):
63
  """Ask OpenAI API about an image"""
64
  client = OpenAI(api_key=api_key)
65
  prompt = custom_prompt or DEFAULT_PROMPT
 
72
  try:
73
  r = await asyncio.to_thread(
74
  client.chat.completions.create,
75
+ model=model, messages=msg, max_tokens=60,
76
  response_format={"type":"json_object"}
77
  )
78
  return json.loads(r.choices[0].message.content)
 
144
 
145
  return out
146
 
147
+ async def recurse(img, region, depth, rows, cols, ov, pad, prog, api_key, model, all_results=None):
148
  """
149
  Returns (final_region, path, results) where:
150
  - path is a list of chosen regions, one per stage
 
161
 
162
  async def task(i, r):
163
  crop_img = crop(img, r)
164
+ result = await ask_api(crop_img, api_key, model)
165
  return i, result, r
166
 
167
  results = [None] * len(subs)
 
187
  all_results.append(stage_results)
188
 
189
  final_reg, sub_path, all_res = await recurse(
190
+ img, best_region, depth - 1, rows, cols, ov, pad, prog, api_key, model, all_results
191
  )
192
+
193
 
194
  return final_reg, [best_region] + sub_path, all_res
195
 
196
  # ─────────── GRADIO PIPELINE ───────────
197
+ def run_pipeline(pil_img, api_key, model, rows, cols, zoom, ov, pad, progress=gr.Progress()):
198
  """Main pipeline to process an image and find humans"""
199
+ # Input validation and error checking
200
+ error_message = None
201
+
202
+ # Check if image was provided
203
+ if pil_img is None:
204
+ error_message = "Error: Please upload an image to analyze."
205
  # Check for API key
206
+ elif not api_key or api_key.strip() == "":
207
+ error_message = "Error: OpenAI API key is required to run the search."
208
+ # Check if model is selected
209
+ elif not model or model.strip() == "":
210
+ error_message = "Error: Please select an OpenAI model."
211
+
212
+ if error_message:
213
+ return (None, None, None, error_message)
214
 
215
  # Input validation
216
+ try:
217
+ rows = max(1, min(int(rows), 10))
218
+ cols = max(1, min(int(cols), 10))
219
+ zoom = max(1, min(int(zoom), 3))
220
+ ov = max(0, min(float(ov), 0.9))
221
+ pad = max(0, min(float(pad), 0.3))
222
+ except (ValueError, TypeError):
223
+ return (None, None, None, "Error: Invalid parameter values. Using defaults instead.")
224
 
225
  with tempfile.TemporaryDirectory() as td:
226
+ try:
227
+ img_path = str(Path(td) / "in.jpg")
228
+ pil_img.save(img_path)
229
+
230
+ # Convert to RGB after reading (OpenCV reads as BGR)
231
+ img = cv2.cvtColor(cv2.imread(img_path), cv2.COLOR_BGR2RGB)
232
 
233
+ progress(0, desc=f"Starting recursive search using {model}...")
234
+ final_reg, path, all_results = asyncio.run(
235
+ recurse(img, (0, 0, 1, 1), zoom, rows, cols, ov, pad, progress, api_key, model)
236
+ )
237
 
238
+ # Get final crop (already in RGB)
239
+ crop_img = crop(img, final_reg)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
240
 
241
+ # Collect results for each stage
242
+ stage_results = []
243
+ best_results = []
244
+ for stage in all_results:
245
+ best_idx = stage["best_idx"]
246
+ results = stage["results"]
247
+ if best_idx is not None and best_idx < len(results):
248
+ best_results.append(results[best_idx])
249
+
250
+ # Create the path visualization (already in RGB)
251
+ path_img = draw_path(img, path, best_results)
252
+
253
+ # Create visualization of all tiles in the first stage
254
+ if all_results and len(all_results) > 0:
255
+ first_stage = all_results[0]
256
+ stage_img = img.copy()
 
257
 
258
+ for i, r in enumerate(first_stage["results"]):
259
+ region = first_stage["region"] if i == first_stage["best_idx"] else None
260
+ x,y,w,h = split(region or (0,0,1,1), rows, cols, ov, pad)[i]
261
+ H,W = img.shape[:2]
262
+ x1,y1,x2,y2 = int(x*W), int(y*H), int((x+w)*W), int((y+h)*H)
263
+
264
+ # Color based on detection
265
+ if r["detected"] == "YES":
266
+ color = (0, 255, 0) # Green
267
+ elif r["detected"] == "MAYBE":
268
+ color = (0, 165, 255) # Orange
269
+ else:
270
+ color = (255, 0, 0) # Red (in RGB)
271
+
272
+ # Draw rectangle with confidence
273
+ cv2.rectangle(stage_img, (x1,y1), (x2,y2), color, 1)
274
+ conf = r.get("confidence", 0)
275
+
276
+ # Add text with background
277
+ label = f"{r['detected']} ({conf:.2f})"
278
+ font = cv2.FONT_HERSHEY_SIMPLEX
279
+ font_scale = 0.4
280
+ thickness = 1
281
+ text_size = cv2.getTextSize(label, font, font_scale, thickness)[0]
282
+
283
+ # Draw background for text
284
+ cv2.rectangle(stage_img, (x1, y1-text_size[1]-5), (x1+text_size[0]+5, y1), (0,0,0), -1)
285
+ cv2.putText(stage_img, label, (x1+2, y1-5), font, font_scale, (255,255,255), thickness)
286
 
287
+ # Mark best tile with thicker border
288
+ best_idx = first_stage["best_idx"]
289
+ if best_idx is not None and best_idx < len(split((0,0,1,1), rows, cols, ov, pad)):
290
+ r = split((0,0,1,1), rows, cols, ov, pad)[best_idx]
291
+ x,y,w,h = r
292
+ H,W = img.shape[:2]
293
+ x1,y1,x2,y2 = int(x*W), int(y*H), int((x+w)*W), int((y+h)*H)
294
+ cv2.rectangle(stage_img, (x1,y1), (x2,y2), (0,255,255), 3) # Yellow thick border
295
+ else:
296
+ stage_img = img.copy()
297
 
298
+ # Create a summary of the results
299
+ summary = []
300
+ for i, res in enumerate(best_results):
301
+ summary.append(f"Stage {i+1}: {res['detected']} ({res['confidence']:.2f}) - {res['reason']}")
302
+
303
+ summary_text = "\n".join(summary)
304
+
305
+ # Return results
306
+ return crop_img, path_img, stage_img, summary_text
307
+ except Exception as e:
308
+ # Handle any other exceptions
309
+ return (None, None, None, f"Error: {str(e)}")
 
 
 
 
 
 
 
 
310
 
311
  # ─────────── UI ───────────
312
  with gr.Blocks(title="Eagle‑Eyes Search", css="footer {visibility: hidden}") as demo:
 
335
  info="Your API key will be used only for this session and not stored"
336
  )
337
 
338
+ # Model selection dropdown
339
+ model = gr.Dropdown(
340
+ choices=AVAILABLE_MODELS,
341
+ value=DEFAULT_MODEL,
342
+ label="Model Selection",
343
+ info="Select the OpenAI model to use for analysis"
344
+ )
345
+
346
  with gr.Group():
347
  with gr.Row():
348
  row = gr.Number(value=DEFAULTS["row"], label="Grid Rows", precision=0, minimum=1, maximum=10)
 
371
  ### Tips for Best Results
372
 
373
  - **OpenAI API Key**: Required to use this tool. Your key remains private and is not stored
374
+ - **Model Selection**: Choose the appropriate OpenAI model:
375
+ - `gpt-4o`: Best overall performance for general vision tasks (recommended)
376
+ - `gpt-4o-mini`: More economical version of gpt-4o with good performance
377
+ - `o1`: Advanced reasoning model, excellent for complex analysis
378
+ - `o1-mini`: Smaller, faster version of o1
379
+ - `o3-mini`: Newest reasoning model (Jan 2025), optimized for STEM tasks
380
+ - `gpt-4-vision-preview`: Original vision model (being deprecated)
381
+ - `gpt-4-turbo`: Older model with vision capabilities
382
  - **Grid Size**: More rows/columns give better precision but require more API calls
383
  - **Zoom Levels**: More levels allow deeper searching in complex images
384
  - **Overlap**: Higher overlap prevents missing objects at tile boundaries
 
388
  """)
389
 
390
  btn.click(run_pipeline,
391
+ inputs=[img_in, api_key, model, row, col, zoom, ov, pad],
392
  outputs=[crop_out, path_out, stage_out, summary_out])
393
 
394
  if __name__ == "__main__":