# mathstutor / app / tools / vision_analyzer.py
# ghadgemadhuri92's picture
# robust performance tuning
# c00b41f
import base64
import binascii
import logging
import re
from typing import Dict, Any, Optional

import cv2
import numpy as np
logger = logging.getLogger(__name__)


class VisionAnalyzer:
    """
    Applies mathematical reasoning to visual inputs using YOLO (Quantitative)
    and prepares context for Gemini (Qualitative).

    The YOLO model is loaded lazily, and only when a query actually calls for
    counting / quantification; other queries are returned in "qualitative"
    mode without touching the detector.
    """

    # Aliases for semantic grounding: maps a detector class name to words a
    # user might use for the same object, so results can be reported in the
    # user's own vocabulary.
    ALIAS_MAP = {
        "sports ball": ["ball", "marble", "sphere", "globe", "orb"],
        "bottle": ["flask", "container", "vial"],
        "cup": ["mug", "glass", "tumbler"],
        "book": ["notebook", "textbook", "novel"],
        "vase": ["urn", "pot", "jar"],  # Added vase for the marble misclassification
    }

    # Query fragments that imply counting / detection / quantification.
    _QUANTITATIVE_KEYWORDS = (
        "count",
        "how many",
        "number of",
        "calculate",
        "probability",
        "statistics",
        "quantify",
        "fraction",
        "percentage",
    )

    def __init__(self, model_path: str = "yolov8n.pt"):
        """
        Store configuration; the YOLO model itself is loaded on first use.

        Args:
            model_path: Path to YOLO weights. Defaults to nano model
                (will download if missing).
        """
        self.model_path = model_path
        self.model: Optional[Any] = None
        # Color ranges in HSV as (lower, upper) inclusive bounds.
        # Note: OpenCV uses H: 0-179, S: 0-255, V: 0-255
        # Red wraps around the hue axis, hence the two "red*" entries.
        self.color_ranges = {
            "red1": ((0, 70, 50), (10, 255, 255)),
            "red2": ((170, 70, 50), (180, 255, 255)),
            "green": ((36, 70, 50), (89, 255, 255)),
            "blue": ((90, 70, 50), (128, 255, 255)),
            "yellow": ((20, 100, 100), (35, 255, 255)),
            "black": ((0, 0, 0), (180, 255, 30)),
            "white": ((0, 0, 200), (180, 50, 255)),
            "gray": ((0, 0, 50), (180, 50, 200)),
        }

    def _ensure_model(self):
        """
        Lazy load the model if not already loaded.

        Any failure (missing package, bad weights path, ...) is logged and
        leaves ``self.model`` as ``None``; nothing is raised to the caller.
        """
        if self.model is not None:
            return
        try:
            logger.info("Loading YOLO model (Lazy): %s", self.model_path)
            # Imported here so that merely importing this module does not
            # require ultralytics to be installed.
            from ultralytics import YOLO
            self.model = YOLO(self.model_path)
        except Exception as e:
            logger.error("Failed to load YOLO model: %s", e)
            self.model = None

    @staticmethod
    def _mentions(word: str, text: str) -> bool:
        """Return True if *word* (or a simple plural of it) is a whole word in *text*."""
        # Word boundaries stop "urn" matching inside "return" or "pot" inside
        # "spot"; the optional "s"/"es" keeps "marbles" and "glasses" matching.
        pattern = rf"\b{re.escape(word)}(?:e?s)?\b"
        return re.search(pattern, text) is not None

    def get_canonical_name(self, detected_name: str, user_query: str) -> str:
        """
        Checks if the user's query mentions an alias for a detected object.

        Args:
            detected_name: Class name reported by the detector.
            user_query: The user's free-text query (may be empty).

        Returns:
            ``detected_name`` if the query mentions it or mentions none of
            its aliases; otherwise the first alias from ``ALIAS_MAP`` that
            appears in the query as a whole word.
        """
        query_lower = (user_query or "").lower()

        # If the detected name is in the query, keep it
        if self._mentions(detected_name, query_lower):
            return detected_name

        # Otherwise prefer the word the user actually used, if any.
        for alias in self.ALIAS_MAP.get(detected_name, ()):
            if self._mentions(alias, query_lower):
                return alias
        return detected_name

    def _detect_color(self, roi: np.ndarray) -> str:
        """
        Detect dominant color in a Region of Interest (ROI).

        Args:
            roi: Image crop; converted from BGR to HSV before matching.

        Returns:
            The color whose HSV range(s) cover the most pixels, or
            ``"unknown"`` if the ROI is empty or no range matches any pixel.
        """
        if roi.size == 0:
            return "unknown"

        hsv_roi = cv2.cvtColor(roi, cv2.COLOR_BGR2HSV)

        pixel_counts: Dict[str, int] = {}
        for range_name, (lower, upper) in self.color_ranges.items():
            mask = cv2.inRange(hsv_roi, np.array(lower), np.array(upper))
            # Merge the split red ranges: both halves of the hue wrap-around
            # must be summed, otherwise red is under-counted against colors
            # that are covered by a single range.
            color = "red" if "red" in range_name else range_name
            pixel_counts[color] = pixel_counts.get(color, 0) + cv2.countNonZero(mask)

        best_color, best_count = max(
            pixel_counts.items(), key=lambda item: item[1], default=("unknown", 0)
        )
        return best_color if best_count > 0 else "unknown"

    @staticmethod
    def _decode_image(image_data: str) -> np.ndarray:
        """
        Decode a base64 string (optionally a ``data:...;base64,`` URI) to a BGR image.

        Raises:
            ValueError: If the payload is not valid base64 (``binascii.Error``
                is a ``ValueError`` subclass) or OpenCV cannot decode it.
        """
        if ";base64," in image_data:
            # Strip the data-URI header; maxsplit=1 keeps this from failing
            # to unpack if the marker somehow appears more than once.
            image_data = image_data.split(";base64,", 1)[1]
        img_bytes = base64.b64decode(image_data)
        nparr = np.frombuffer(img_bytes, np.uint8)
        img = cv2.imdecode(nparr, cv2.IMREAD_COLOR)
        if img is None:
            raise ValueError("Failed to decode image (cv2 returned None)")
        return img

    def analyze(self, image_data: str, query: str = "") -> Dict[str, Any]:
        """
        Analyze the image.

        Args:
            image_data: Base64 encoded image string.
            query: User's query to determine intent (optional). With an empty
                query the detector always runs.

        Returns:
            Dict containing structured analysis results. On success it has
            ``status``, ``vision_mode`` ("qualitative" when YOLO was skipped,
            "quantitative" when it ran) and ``quantitative_analysis`` (``None``
            or a dict with ``objects``, ``total_objects``, ``avg_confidence``).
            On failure ``status`` is ``"error"`` and ``error`` holds a message.
        """
        result: Dict[str, Any] = {
            "vision_mode": "qualitative",  # Default to Gemini unless YOLO runs
            "quantitative_analysis": None,
            "status": "success",
        }

        # Heuristic: Only run YOLO if query implies counting/detection/quantification.
        # This is decided before loading the model so a purely qualitative
        # query neither pays the load cost nor fails when YOLO is unavailable.
        if query:
            query_lower = query.lower()
            if not any(k in query_lower for k in self._QUANTITATIVE_KEYWORDS):
                return result

        self._ensure_model()
        if self.model is None:
            return {"status": "error", "error": "Model not available"}

        try:
            try:
                img = self._decode_image(image_data)
            except (binascii.Error, ValueError) as e:
                logger.error("Image decoding failed in VisionAnalyzer: %s", e)
                return {"status": "error", "error": f"Invalid image data: {str(e)}"}

            # Run inference
            results = self.model(img, verbose=False)

            height, width = img.shape[:2]
            detections: Dict[str, int] = {}
            confidences = []

            for r in results:
                for box in r.boxes:
                    confidences.append(float(box.conf[0]))

                    # Class name, reported in the user's vocabulary if possible
                    cls_id = int(box.cls[0])
                    raw_class_name = self.model.names[cls_id]
                    class_name = self.get_canonical_name(raw_class_name, query)

                    # Clamp the box to the image before cropping
                    x1, y1, x2, y2 = map(int, box.xyxy[0])
                    x1, y1 = max(0, x1), max(0, y1)
                    x2, y2 = min(width, x2), min(height, y2)
                    color = self._detect_color(img[y1:y2, x1:x2])

                    # Key construction: e.g. "red_sports ball"
                    key = f"{color}_{class_name}"
                    detections[key] = detections.get(key, 0) + 1

            # YOLO ran (even if it found nothing), so report quantitative
            # mode; with no detections this yields {}, 0 and 0.0.
            avg_conf = sum(confidences) / len(confidences) if confidences else 0.0
            result["vision_mode"] = "quantitative"
            result["quantitative_analysis"] = {
                "objects": detections,
                "total_objects": sum(detections.values()),
                "avg_confidence": round(avg_conf, 2),
            }
        except Exception as e:
            # Top-level boundary: report the failure to the caller as data.
            logger.exception("Vision Analysis failed: %s", e)
            result["status"] = "error"
            result["error"] = str(e)
        return result