Spaces:

ghadgemadhuri92
/

mathstutor

Running

App Files Files Community

mathstutor / app /tools /vision_analyzer.py

ghadgemadhuri92

robust performance tuning

c00b41f 3 months ago

raw

history blame contribute delete

7.59 kB

	import logging
	import base64
	import numpy as np
	import cv2
	import binascii
	from typing import Dict, Any, Optional

	logger = logging.getLogger(__name__)

	class VisionAnalyzer:
	"""
	Applies mathematical reasoning to visual inputs using YOLO (Quantitative)
	and prepares context for Gemini (Qualitative).
	"""

	def __init__(self, model_path: str = "yolov8n.pt"):
	"""
	Initialize YOLO model.
	Args:
	model_path: Path to YOLO weights. Defaults to nano model (will download if missing).
	"""
	self.model_path = model_path
	self.model = None

	# Color ranges in HSV
	# Note: OpenCV uses H: 0-179, S: 0-255, V: 0-255
	self.color_ranges = {
	"red1": ((0, 70, 50), (10, 255, 255)),
	"red2": ((170, 70, 50), (180, 255, 255)),
	"green": ((36, 70, 50), (89, 255, 255)),
	"blue": ((90, 70, 50), (128, 255, 255)),
	"yellow": ((20, 100, 100), (35, 255, 255)),
	"black": ((0, 0, 0), (180, 255, 30)),
	"white": ((0, 0, 200), (180, 50, 255)),
	"gray": ((0, 0, 50), (180, 50, 200))
	}

	def _ensure_model(self):
	"""Lazy load the model if not already loaded."""
	if self.model is None:
	try:
	logger.info(f"Loading YOLO model (Lazy): {self.model_path}")
	from ultralytics import YOLO
	self.model = YOLO(self.model_path)
	except Exception as e:
	logger.error(f"Failed to load YOLO model: {e}")
	self.model = None

	# Define aliases for Semantic Grounding
	ALIAS_MAP = {
	"sports ball": ["ball", "marble", "sphere", "globe", "orb"],
	"bottle": ["flask", "container", "vial"],
	"cup": ["mug", "glass", "tumbler"],
	"book": ["notebook", "textbook", "novel"],
	"vase": ["urn", "pot", "jar"] # Added vase for the marble misclassification
	}

	def get_canonical_name(self, detected_name: str, user_query: str) -> str:
	"""
	Checks if the user's query mentions an alias for a detected object.
	"""
	user_query_lower = user_query.lower()

	# If the detected name is in the query, keep it
	if detected_name in user_query_lower:
	return detected_name

	# Check aliases
	for canonical, aliases in self.ALIAS_MAP.items():
	if detected_name == canonical:
	for alias in aliases:
	if alias in user_query_lower:
	return alias # Return the word the user actually used

	return detected_name

	def _detect_color(self, roi: np.ndarray) -> str:
	"""
	Detect dominant color in a Region of Interest (ROI).
	"""
	if roi.size == 0:
	return "unknown"

	hsv_roi = cv2.cvtColor(roi, cv2.COLOR_BGR2HSV)
	max_pixels = 0
	dominant_color = "unknown"

	for color, (lower, upper) in self.color_ranges.items():
	mask = cv2.inRange(hsv_roi, np.array(lower), np.array(upper))
	count = cv2.countNonZero(mask)

	# Merit handling for split red ranges
	actual_color = "red" if "red" in color else color

	# Store counts? Simple max strategy for now
	if count > max_pixels:
	max_pixels = count
	dominant_color = actual_color

	return dominant_color

	def analyze(self, image_data: str, query: str = "") -> Dict[str, Any]:
	"""
	Analyze the image.

	Args:
	image_data: Base64 encoded image string.
	query: User's query to determine intent (optional).

	Returns:
	Dict containing structured analysis results.
	"""
	result = {
	"vision_mode": "qualitative", # Default to Gemini unless YOLO runs
	"quantitative_analysis": None,
	"status": "success"
	}

	self._ensure_model()
	if not self.model:
	return {"status": "error", "error": "Model not available"}

	# Heuristic: Only run YOLO if query implies counting/detection/quantification
	keywords = ["count", "how many", "number of", "calculate", "probability", "statistics", "quantify", "fraction", "percentage"]
	should_run_yolo = any(k in query.lower() for k in keywords) if query else True

	if not should_run_yolo:
	return result

	try:
	# Decode image
	if ";base64," in image_data:
	_, image_data = image_data.split(";base64,")

	try:
	img_bytes = base64.b64decode(image_data)
	nparr = np.frombuffer(img_bytes, np.uint8)
	img = cv2.imdecode(nparr, cv2.IMREAD_COLOR)

	if img is None:
	raise ValueError("Failed to decode image (cv2 returned None)")
	except (binascii.Error, ValueError) as e:
	logger.error(f"Image decoding failed in VisionAnalyzer: {e}")
	return {"status": "error", "error": f"Invalid image data: {str(e)}"}

	# Run Interface
	results = self.model(img, verbose=False)

	detections = {}
	confidences = []

	for r in results:
	for box in r.boxes:
	# Confidence
	conf = float(box.conf[0])
	confidences.append(conf)

	# Class Name
	cls_id = int(box.cls[0])
	raw_class_name = self.model.names[cls_id]
	class_name = self.get_canonical_name(raw_class_name, query)

	# Color Detection
	x1, y1, x2, y2 = map(int, box.xyxy[0])
	# Clamp coordinates
	h, w, _ = img.shape
	x1, y1 = max(0, x1), max(0, y1)
	x2, y2 = min(w, x2), min(h, y2)

	roi = img[y1:y2, x1:x2]
	color = self._detect_color(roi)

	# Key construction: e.g. "red_sports ball"
	key = f"{color}_{class_name}"

	if key in detections:
	detections[key] += 1
	else:
	detections[key] = 1

	if detections:
	total_objects = sum(detections.values())
	avg_conf = sum(confidences) / len(confidences) if confidences else 0.0

	result["vision_mode"] = "quantitative"
	result["quantitative_analysis"] = {
	"objects": detections,
	"total_objects": total_objects,
	"avg_confidence": round(avg_conf, 2)
	}
	else:
	# YOLO ran but found nothing
	result["vision_mode"] = "quantitative" # It still ran
	result["quantitative_analysis"] = {
	"objects": {},
	"total_objects": 0,
	"avg_confidence": 0.0
	}

	except Exception as e:
	logger.error(f"Vision Analysis failed: {e}")
	result["status"] = "error"
	result["error"] = str(e)

	return result