Spaces:

ghadgemadhuri92
/

mathstutor

Running

App Files Files Community

ghadgemadhuri92 committed on Feb 16

Commit

1df75cb

1 Parent(s): 8bdfa24

read_image tool for text extraction from images

Browse files

Files changed (5) hide show

all_code.txt +0 -0
app/agents/adk_mathminds.py +55 -11
app/core/ocr.py +77 -85
tests/test_ocr_simple.py +37 -0
tests/test_ocr_tool.py +49 -0

all_code.txt DELETED Viewed

The diff for this file is too large to render. See raw diff

app/agents/adk_mathminds.py CHANGED Viewed

@@ -1,4 +1,5 @@
 import logging
 import asyncio
 import base64
@@ -13,6 +14,8 @@ from app.core.settings import settings
 from app.tools.web_scraper import WebScraper
 from app.tools.symbolic_solver import SymbolicSolver
 from app.tools.similarity_search import SimilarProblemFinder
 from app.core.math_normalizer import MathQueryNormalizer
 logger = logging.getLogger(__name__)
@@ -23,7 +26,7 @@ class MathMindsADKAgent:
     Refined to match official Multitool Agent documentation patterns.
     """
-    def __init__(self, model_name: str = "gemini-2.5-flash"):
         self.api_key = settings.GOOGLE_API_KEY
         if not self.api_key:
             logger.warning("No Google API Key found. Agent will fail.")
@@ -33,6 +36,8 @@ class MathMindsADKAgent:
         self.symbolic_solver = SymbolicSolver()
         self.normalizer = MathQueryNormalizer()
         self.similar_finder = SimilarProblemFinder()
         # Define Tools as simpler closures
         # Docs pattern: simple functions, passed in a list.
@@ -82,20 +87,51 @@ class MathMindsADKAgent:
                 formatted += f"Problem: {item.get('problem_text')}\nSolution: {item.get('solution_text')}\n---\n"
             return formatted
         # Initialize Agent
         # Using 'Agent' class as per official docs, passing functions directly.
         self.agent = Agent(
             name="math_minds_core",
             model=model_name,
-            tools=[web_search, math_solver, find_similar_problems], # Passed directly as function list
             instruction=(
                 "You are MathMinds AI, a helpful and precise mathematical assistant. "
-                "You have access to tools for solving symbolic math problems, searching the web, and finding similar solved problems. "
-                "If an image is provided, analyze it mathematically. "
-                "Use 'Math Solver' for distinct math problems (equations, calculus, etc.). "
-                "Use 'Web Search' for real-world data (prices, weather, facts). "
-                "Use 'Find Similar Problems' to look up examples if you are unsure how to solve a problem. "
-                "Always explain your steps clearly."
             )
         )
@@ -141,11 +177,19 @@ class MathMindsADKAgent:
             if image_data:
                 try:
-                    img_bytes = base64.b64decode(image_data)
-                    mime_type = "image/png"
                     if image_data.startswith("/9j/"):
                         mime_type = "image/jpeg"
                     parts.append(types.Part.from_bytes(data=img_bytes, mime_type=mime_type))
                     logger.info("Attached image to agent request.")
                 except Exception as e:

 import logging
 import asyncio
 import base64
 from app.tools.web_scraper import WebScraper
 from app.tools.symbolic_solver import SymbolicSolver
 from app.tools.similarity_search import SimilarProblemFinder
+from app.core.ocr import OCRProcessor
+from app.tools.vision_analyzer import VisionAnalyzer
 from app.core.math_normalizer import MathQueryNormalizer
 logger = logging.getLogger(__name__)
     Refined to match official Multitool Agent documentation patterns.
     """
+    def __init__(self, model_name: str = "gemini-2.5-pro"):
         self.api_key = settings.GOOGLE_API_KEY
         if not self.api_key:
             logger.warning("No Google API Key found. Agent will fail.")
         self.symbolic_solver = SymbolicSolver()
         self.normalizer = MathQueryNormalizer()
         self.similar_finder = SimilarProblemFinder()
+        self.ocr = OCRProcessor()
+        self.vision_analyzer = VisionAnalyzer()
         # Define Tools as simpler closures
         # Docs pattern: simple functions, passed in a list.
                 formatted += f"Problem: {item.get('problem_text')}\nSolution: {item.get('solution_text')}\n---\n"
             return formatted
+        def read_image(image_data: str) -> str:
+            """
+            Useful for reading text, numbers, or equations from an image when you cannot see it clearly or need the exact text.
+            Args:
+                image_data: The base64 string of the image.
+            """
+            # The docstring above is kept verbatim: the agent framework presumably
+            # surfaces it to the model as the tool description — TODO confirm.
+            # NOTE(review): the model has to pass the base64 payload itself as the
+            # argument; verify it actually has that string when the image is attached
+            # to the request as a binary part.
+            try:
+                extracted = self.ocr.extract_text(image_data=image_data)
+            except Exception as exc:
+                # Failures are handed back to the model as plain text.
+                return f"Error reading image: {exc!s}"
+            return extracted or "No text found in image."
+        async def analyze_image(image_data: str, focus: str = "") -> str:
+            """
+            Analyzes an image mathematically: extracts equations, counts objects, describes graphs, etc.
+            Use this when the user uploaded an image and wants to count items or understand the visual content.
+            Args:
+                image_data: The base64 string of the image.
+                focus: Optional string to focus analysis (e.g. "count red balls").
+            """
+            # Returns str(result) from the analyzer, or a message starting with
+            # "Image analysis failed:" when the analyzer raises.
+            import inspect  # local import: only this tool needs it
+            try:
+                result = self.vision_analyzer.analyze(image_data, focus)
+                # This tool is a coroutine, yet the analyzer was called without
+                # awaiting. If analyze() is itself async, str() of the un-awaited
+                # coroutine would be returned instead of the analysis, so await
+                # the result whenever it is awaitable.
+                if inspect.isawaitable(result):
+                    result = await result
+                return str(result)
+            except Exception as e:
+                return f"Image analysis failed: {str(e)}"
         # Initialize Agent
         # Using 'Agent' class as per official docs, passing functions directly.
         self.agent = Agent(
             name="math_minds_core",
             model=model_name,
+            tools=[web_search, math_solver, find_similar_problems, read_image, analyze_image], # Passed directly as function list
             instruction=(
                 "You are MathMinds AI, a helpful and precise mathematical assistant. "
+                "You can receive BOTH text instructions AND images in the same query. "
+                "When an image is provided, ALWAYS analyze it first — describe what you see, "
+                "extract equations if present, count objects if it's a probability/statistics question, "
+                "or interpret graphs/charts/diagrams mathematically. "
+                "Then combine the image analysis with the text prompt to give a complete answer. "
+                "Use tools only when necessary (e.g. 'Math Solver' for symbolic work, 'Web Search' for facts). "
+                "Use 'Read Image' to extract text from images if it's blurry or you need exact wording. "
+                "Use 'Analyze Image' to count objects or detect items. "
+                "Always explain your steps clearly and show reasoning."
             )
         )
             if image_data:
                 try:
+                    # Better MIME type detection
                     if image_data.startswith("/9j/"):
                         mime_type = "image/jpeg"
+                    elif image_data.startswith("iVBORw"):
+                        mime_type = "image/png"
+                    elif image_data.startswith("R0lGOD"):
+                        mime_type = "image/gif"
+                    elif image_data.startswith("UklGR"):
+                        mime_type = "image/webp"
+                    else:
+                        mime_type = "image/png" # Default fallback
+                    img_bytes = base64.b64decode(image_data)
                     parts.append(types.Part.from_bytes(data=img_bytes, mime_type=mime_type))
                     logger.info("Attached image to agent request.")
                 except Exception as e:

app/core/ocr.py CHANGED Viewed

@@ -1,21 +1,90 @@
 import base64
 import requests
 import io
 import logging
-from typing import Optional
-from PIL import Image, ImageEnhance, ImageOps
 logger = logging.getLogger(__name__)
 class OCRProcessor:
     """
-    Handles image validation and download.
-    Note: PaddleOCR has been removed. This class now acts as an image helper.
     """
     def __init__(self, max_size_bytes: int = 5 * 1024 * 1024): # 5MB limit
         self.max_size = max_size_bytes
-        # No OCR engine init needed
     def optimize_base64(self, b64_string: str) -> str:
         """
@@ -23,114 +92,37 @@ class OCRProcessor:
         Returns optimized base64 string.
         """
         try:
-             # Basic strip
              if ";base64," in b64_string:
-                header, data = b64_string.split(";base64,")
              else:
-                header = None
                 data = b64_string
              img_data = base64.b64decode(data)
              img = Image.open(io.BytesIO(img_data))
-             # Resize if too large
              max_dim = 1024
              if max(img.size) > max_dim:
                  img.thumbnail((max_dim, max_dim), Image.Resampling.LANCZOS)
-             # Convert to JPEG for compression (if RGBA, convert to RGB)
              if img.mode in ('RGBA', 'P'):
                  img = img.convert('RGB')
              buffer = io.BytesIO()
-             # Quality 85 is good balance
              img.save(buffer, format="JPEG", quality=85)
              return base64.b64encode(buffer.getvalue()).decode('utf-8')
         except Exception as e:
-            logger.warning(f"Image optimization failed, using original: {e}")
             return b64_string
     def download_image_as_base64(self, url: str) -> Optional[str]:
-        """
-        Download image from URL and return as base64 string.
-        """
         try:
             response = requests.get(url, timeout=10, stream=True)
             response.raise_for_status()
-            # Size check
             if len(response.content) > self.max_size:
-                logger.warning(f"Downloaded image bytes {len(response.content)} exceed limit.")
                 return None
-            # Optimize immediately
             b64 = base64.b64encode(response.content).decode('utf-8')
             return self.optimize_base64(b64)
         except Exception as e:
             logger.error(f"Image download failed: {e}")
             return None
-    def _preprocess_image(self, img: Image.Image) -> Image.Image:
-        """
-        Applies preprocessing to improve image quality for Vision model.
-        - Grayscale conversion
-        - Contrast enhancement
-        - Binarization (Thresholding)
-        """
-        try:
-            # 1. Convert to grayscale
-            img = img.convert('L')
-            # 2. Enhance contrast
-            enhancer = ImageEnhance.Contrast(img)
-            img = enhancer.enhance(2.0)
-            # 3. Apply thresholding (binarization)
-            # This makes the image pure black and white, removing noise
-            img = img.point(lambda x: 0 if x < 128 else 255, '1')
-            return img
-        except Exception as e:
-            logger.warning(f"Image preprocessing failed, using original: {e}")
-            return img
-    def _process_image_data(self, image_bytes: bytes) -> Optional[str]:
-        """
-        Validate image format.
-        Returns dummy string or None.
-        DEPRECATED: Used to do OCR. Now just validates.
-        """
-        # 1. Size Check
-        if len(image_bytes) > self.max_size:
-            logger.warning("Image data exceeds size limit.")
-            return None
-        # 2. Format Validation (using Pillow)
-        try:
-            img = Image.open(io.BytesIO(image_bytes))
-            img.verify() # Verify it's an image
-            # Re-open for processing (verify closes the file)
-            img = Image.open(io.BytesIO(image_bytes))
-            if img.format.upper() not in ('JPEG', 'JPG', 'PNG', 'BMP', 'WEBP'):
-                 logger.warning(f"Unsupported image format: {img.format}")
-                 return None
-            return "VALID_IMAGE"
-        except Exception as e:
-             logger.warning(f"Invalid image file: {e}")
-             return None
-    # Legacy methods stubbed out or removed.
-    # process_base64 and process_url were used for text extraction.
-    # Calling them now should return None to indicate no text extracted.
-    def process_base64(self, b64_string: str) -> Optional[str]:
-         return None
-    def process_url(self, url: str) -> Optional[str]:
-         return None

 import base64
 import requests
 import io
 import logging
+import numpy as np
+from typing import Optional, List
+from PIL import Image, ImageEnhance
+try:
+    from paddleocr import PaddleOCR
+except ImportError:
+    PaddleOCR = None
 logger = logging.getLogger(__name__)
 class OCRProcessor:
     """
+    Handles OCR text extraction using PaddleOCR and image preprocessing.
     """
+    _os_instance = None # Singleton for OCR engine
     def __init__(self, max_size_bytes: int = 5 * 1024 * 1024): # 5MB limit
+        """
+        Store the size limit and start with no OCR engine loaded.
+        Args:
+            max_size_bytes: Upper bound, in bytes, that download_image_as_base64
+                compares the downloaded body against.
+        """
         self.max_size = max_size_bytes
+        # Stays None until the `engine` property is first read.
+        self.ocr_engine = None
+    @property
+    def engine(self):
+        """
+        Return the PaddleOCR engine, creating it on first use.
+        The engine is built once and stored on the class (`_os_instance`), so
+        every OCRProcessor reuses it instead of constructing its own copy.
+        Returns:
+            The PaddleOCR instance, or None when the paddleocr package could not
+            be imported.
+        """
+        if self.ocr_engine is not None:
+            return self.ocr_engine
+        if not PaddleOCR:
+            logger.error("PaddleOCR not installed.")
+            return None
+        cls = type(self)
+        # `_os_instance` is declared on the class as the engine singleton but was
+        # never assigned, so each processor used to build a separate engine.
+        if cls._os_instance is None:
+            logger.info("Initializing PaddleOCR engine...")
+            cls._os_instance = PaddleOCR(use_angle_cls=True, lang='en')
+        self.ocr_engine = cls._os_instance
+        return self.ocr_engine
+    def extract_text(self, headers_b64: Optional[str] = None, image_data: Optional[str] = None) -> str:
+        """
+        Extract text from a base64-encoded image with PaddleOCR.
+        Args:
+            headers_b64: Fallback for callers that pass the payload positionally,
+                i.e. extract_text(b64); used only when image_data is empty.
+            image_data: Base64 string of the image, with or without a
+                "...;base64," data-URL prefix.
+        Returns:
+            The recognised lines (confidence above 0.5) joined with newlines.
+            An empty string when no payload is given or nothing passes the
+            threshold, "OCR Engine Unavailable" when the engine cannot be loaded,
+            and "Error reading image: ..." when decoding or OCR raises.
+        """
+        target_b64 = image_data or headers_b64
+        if not target_b64:
+            return ""
+        # Resolve the lazily created engine once and reuse the reference.
+        engine = self.engine
+        if not engine:
+            return "OCR Engine Unavailable"
+        try:
+            # 1. Decode Base64 to Array
+            # partition() cannot fail to unpack the way a two-name split() does
+            # when the marker occurs more than once.
+            _, marker, payload = target_b64.partition(";base64,")
+            if marker:
+                target_b64 = payload
+            img_bytes = base64.b64decode(target_b64)
+            # Close the opened image once the RGB copy has been turned into an array.
+            with Image.open(io.BytesIO(img_bytes)) as raw_img:
+                img_arr = np.array(raw_img.convert("RGB"))
+            # 2. Run OCR
+            result = engine.ocr(img_arr, cls=True)
+            # 3. Parse Results
+            # Each entry is indexed as line[1] == (text, confidence).
+            extracted_lines = []
+            if result and result[0]:
+                extracted_lines = [
+                    line[1][0]
+                    for line in result[0]
+                    if line[1][1] > 0.5 # Confidence threshold
+                ]
+            full_text = "\n".join(extracted_lines)
+            logger.info("OCR extracted %d chars.", len(full_text))
+            return full_text
+        except Exception as e:
+            logger.error("OCR Failed: %s", e)
+            return f"Error reading image: {e}"
     def optimize_base64(self, b64_string: str) -> str:
         """
         Returns optimized base64 string.
         """
         try:
              if ";base64," in b64_string:
+                _, data = b64_string.split(";base64,")
              else:
                 data = b64_string
              img_data = base64.b64decode(data)
              img = Image.open(io.BytesIO(img_data))
              max_dim = 1024
              if max(img.size) > max_dim:
                  img.thumbnail((max_dim, max_dim), Image.Resampling.LANCZOS)
              if img.mode in ('RGBA', 'P'):
                  img = img.convert('RGB')
              buffer = io.BytesIO()
              img.save(buffer, format="JPEG", quality=85)
              return base64.b64encode(buffer.getvalue()).decode('utf-8')
         except Exception as e:
+            logger.warning(f"Image optimization failed: {e}")
             return b64_string
     def download_image_as_base64(self, url: str) -> Optional[str]:
+        """
+        Download image from URL and return as base64 string.
+        The body is read in chunks and the transfer is abandoned as soon as it
+        exceeds self.max_size, so an oversized file is never held in full.
+        Args:
+            url: Address of the image to fetch.
+        Returns:
+            The result of optimize_base64 on the downloaded bytes, or None when
+            the request fails or the image is larger than self.max_size.
+        """
         try:
+            # The context manager releases the connection even on early return.
+            with requests.get(url, timeout=10, stream=True) as response:
+                response.raise_for_status()
+                # Cheap early exit when the server declares the size up front.
+                declared = response.headers.get("Content-Length", "")
+                if declared.isdigit() and int(declared) > self.max_size:
+                    return None
+                body = bytearray()
+                for chunk in response.iter_content(chunk_size=65536):
+                    body.extend(chunk)
+                    if len(body) > self.max_size:
+                        return None
+            b64 = base64.b64encode(body).decode('utf-8')
             return self.optimize_base64(b64)
         except Exception as e:
             logger.error(f"Image download failed: {e}")
             return None

tests/test_ocr_simple.py ADDED Viewed

	@@ -0,0 +1,37 @@

+import sys
+import os
+sys.path.insert(0, os.getcwd())
+import base64
+import io
+from PIL import Image, ImageDraw
+from app.core.ocr import OCRProcessor
+def create_test_image_b64(text: str) -> str:
+    """Render `text` in black on a white 400x100 canvas and return the PNG as base64."""
+    canvas = Image.new('RGB', (400, 100), color=(255, 255, 255))
+    ImageDraw.Draw(canvas).text((10, 40), text, fill=(0, 0, 0))
+    png_bytes = io.BytesIO()
+    canvas.save(png_bytes, format="PNG")
+    encoded = base64.b64encode(png_bytes.getvalue())
+    return encoded.decode()
+def test_ocr_direct():
+    """
+    Render a short phrase into an image and check that OCRProcessor reads it back.
+    Raises:
+        AssertionError: if the phrase is missing from the extracted text.
+    """
+    print("Initializing OCRProcessor...")
+    ocr = OCRProcessor()
+    text = "Hello OCR World"
+    b64 = create_test_image_b64(text)
+    print(f"Extracting text from image with '{text}'...")
+    result = ocr.extract_text(image_data=b64)
+    print(f"Result: {result}")
+    # A printed "FAILURE" still counts as a pass for pytest; assert so that a
+    # missing phrase actually fails the test (and the script run as well).
+    assert text in result, f"FAILURE: Text not found. OCR returned: {result!r}"
+    print("SUCCESS: OCR worked correctly.")
+if __name__ == "__main__":
+    test_ocr_direct()

tests/test_ocr_tool.py ADDED Viewed

	@@ -0,0 +1,49 @@

+import pytest
+import asyncio
+import base64
+import io
+from PIL import Image, ImageDraw, ImageFont
+from app.agents.adk_mathminds import MathMindsADKAgent
+def create_test_image_b64(text: str) -> str:
+    """Draw `text` onto a blank white image and hand back the PNG bytes base64-encoded."""
+    width, height = 400, 100
+    white, black = (255, 255, 255), (0, 0, 0)
+    picture = Image.new('RGB', (width, height), color=white)
+    # No font argument is given to text(), so the library default applies.
+    pen = ImageDraw.Draw(picture)
+    pen.text((10, 40), text, fill=black)
+    out = io.BytesIO()
+    picture.save(out, format="PNG")
+    return base64.b64encode(out.getvalue()).decode()
+@pytest.mark.asyncio
+async def test_ocr_tool_usage():
+    """
+    End-to-end check: the agent receives an image containing a number and must report it.
+    """
+    tutor = MathMindsADKAgent()
+    expected_phrase = "The secret number is 999."
+    encoded_image = create_test_image_b64(expected_phrase)
+    print("\n--- Starting OCR Tool Test ---")
+    print(f"Generated image with text: '{expected_phrase}'")
+    # The prompt names the tool explicitly to nudge the agent towards calling it
+    # rather than relying only on the model's own vision (either may succeed).
+    request = {
+        "problem": "What is the secret number written in this image? Use your read_image tool if needed.",
+        "image_data": encoded_image,
+        "session_id": "test_ocr_session",
+        "user_id": "test_user",
+    }
+    answer = await tutor.solve(**request)
+    print(f"Agent Response: {answer}")
+    assert "999" in answer, f"Agent failed to extract the number. Response: {answer}"
+    print("\nSUCCESS: OCR Tool verified!")
+if __name__ == "__main__":
+    asyncio.run(test_ocr_tool_usage())