| """ |
| VisionOCRAgent for SPARKNET |
| |
| Handles OCR and document vision tasks using Ollama's llava model. |
| Extracts text from images, PDFs, diagrams, and complex documents. |
| """ |
|
|
| import base64 |
| from pathlib import Path |
| from typing import Optional, Dict, Any |
| from loguru import logger |
| from langchain_ollama import ChatOllama |
| from langchain_core.messages import HumanMessage |
|
|
class VisionOCRAgent:
    """
    Specialized agent for vision-based OCR tasks.

    Each public coroutine base64-encodes an image file, sends it together
    with a task-specific prompt to an Ollama-hosted vision-language model
    (llava by default) through ``ChatOllama``, and returns the model's reply.

    Attributes:
        model_name: Name of the Ollama model used for every request.
        base_url: Base URL of the Ollama service.
        vision_llm: ``ChatOllama`` client created with temperature 0.1.
    """

    # MIME type used when the file extension does not identify an image type.
    _DEFAULT_IMAGE_MIME = "image/jpeg"

    # Prompts are kept as class constants so their text does not depend on
    # the indentation of the method that uses them.
    _PROMPT_TEXT_FORMATTED = (
        "Extract all text from this image, preserving the original formatting and structure.\n"
        "\n"
        "Maintain:\n"
        "- Paragraph breaks and line spacing\n"
        "- Bullet points and numbered lists\n"
        "- Section headings and hierarchy\n"
        "- Table structures if present\n"
        "\n"
        "Return only the extracted text, formatted as closely as possible to the original."
    )

    _PROMPT_TEXT_PLAIN = (
        "Extract all text from this image. "
        "Return only the text content without any additional commentary."
    )

    _PROMPT_DIAGRAM = (
        "Analyze this technical diagram in detail. Provide:\n"
        "\n"
        "1. Type of diagram (flowchart, circuit, organizational chart, etc.)\n"
        "2. Main components and elements\n"
        "3. All text labels and annotations\n"
        "4. Connections and relationships between elements\n"
        "5. Overall purpose and meaning\n"
        "\n"
        "Format your response as structured text."
    )

    _PROMPT_TABLE = (
        "Extract the table data from this image.\n"
        "\n"
        "Format the output as a Markdown table with proper alignment:\n"
        "- Use | for column separators\n"
        "- Use | --- | for header separator\n"
        "- Maintain proper column alignment\n"
        "- Include all rows and columns\n"
        "\n"
        "Example format:\n"
        "| Header 1 | Header 2 | Header 3 |\n"
        "| --- | --- | --- |\n"
        "| Data 1 | Data 2 | Data 3 |\n"
        "\n"
        "Return ONLY the table, no additional text."
    )

    _PROMPT_PATENT = (
        "Analyze this patent document page. Extract:\n"
        "\n"
        "1. Patent number or application number (if visible)\n"
        "2. Title or heading\n"
        "3. All body text (claims, descriptions, specifications)\n"
        "4. Figure numbers and captions\n"
        "5. Any diagrams or technical drawings descriptions\n"
        "6. Inventor names and assignee information (if visible)\n"
        "7. Dates (filing date, publication date, etc.)\n"
        "\n"
        "Preserve the structure and formatting. Return comprehensive extracted content."
    )

    _PROMPT_HANDWRITING = (
        "This image contains handwritten text. Please:\n"
        "\n"
        "1. Carefully read all handwritten content\n"
        "2. Transcribe the text exactly as written\n"
        "3. Indicate [unclear] for illegible portions\n"
        "4. Preserve line breaks and spacing\n"
        "5. Note any annotations or margin notes\n"
        "\n"
        "Return only the transcribed text."
    )

    def __init__(self, model_name: str = "llava:7b", base_url: str = "http://localhost:11434"):
        """
        Initialize VisionOCRAgent.

        Args:
            model_name: Ollama vision model to use (default: llava:7b)
            base_url: Ollama service URL
        """
        self.model_name = model_name
        self.base_url = base_url

        # Low temperature keeps transcriptions close to deterministic.
        self.vision_llm = ChatOllama(
            model=model_name,
            base_url=base_url,
            temperature=0.1,
        )

        logger.info(f"Initialized VisionOCRAgent with model: {model_name}")

    def _encode_image(self, image_path: str) -> str:
        """
        Encode image to base64 for llava.

        Args:
            image_path: Path to image file

        Returns:
            Base64 encoded image string

        Raises:
            OSError: If the file cannot be opened or read.
        """
        with open(image_path, "rb") as image_file:
            return base64.b64encode(image_file.read()).decode("utf-8")

    @staticmethod
    def _image_mime_type(image_path: str) -> str:
        """
        Guess the MIME type of an image from its file extension.

        Args:
            image_path: Path to image file

        Returns:
            The guessed ``image/*`` MIME type, or ``image/jpeg`` when the
            extension is missing, unknown, or not an image type.
        """
        # Local import: keeps this block self-contained without touching the
        # file-level import list.
        import mimetypes

        guessed, _ = mimetypes.guess_type(str(image_path))
        if guessed and guessed.startswith("image/"):
            return guessed
        return VisionOCRAgent._DEFAULT_IMAGE_MIME

    async def _query_vision_model(self, prompt: str, image_path: str) -> str:
        """
        Send one prompt plus one image to the vision model.

        Args:
            prompt: Instruction text for the model
            image_path: Path to the image to attach

        Returns:
            The ``content`` of the model's response
        """
        image_data = self._encode_image(image_path)
        mime_type = self._image_mime_type(image_path)

        # The image travels inline as a base64 data URI next to the prompt.
        message = HumanMessage(
            content=[
                {"type": "text", "text": prompt},
                {
                    "type": "image_url",
                    "image_url": f"data:{mime_type};base64,{image_data}",
                },
            ]
        )

        response = await self.vision_llm.ainvoke([message])
        return response.content

    async def extract_text_from_image(
        self,
        image_path: str,
        preserve_formatting: bool = True
    ) -> str:
        """
        Extract text from an image using vision model.

        Args:
            image_path: Path to image file
            preserve_formatting: Whether to preserve document structure

        Returns:
            Extracted text content

        Raises:
            Exception: Any error from reading the file or calling the model
                is logged and re-raised unchanged.
        """
        logger.info(f"📷 Extracting text from: {image_path}")

        try:
            if preserve_formatting:
                prompt = self._PROMPT_TEXT_FORMATTED
            else:
                prompt = self._PROMPT_TEXT_PLAIN

            extracted_text = await self._query_vision_model(prompt, image_path)

            logger.success(f"✅ Extracted {len(extracted_text)} characters from {Path(image_path).name}")
            return extracted_text

        except Exception as e:
            logger.error(f"Failed to extract text from {image_path}: {e}")
            raise

    async def analyze_diagram(self, image_path: str) -> Dict[str, Any]:
        """
        Analyze technical diagrams, flowcharts, and schematics.

        Args:
            image_path: Path to diagram image

        Returns:
            Dictionary with keys ``diagram_type`` (always
            ``"technical_diagram"``), ``analysis`` (model reply) and
            ``source`` (the given path)

        Raises:
            Exception: Any error from reading the file or calling the model
                is logged and re-raised unchanged.
        """
        logger.info(f"📊 Analyzing diagram: {image_path}")

        try:
            analysis = await self._query_vision_model(self._PROMPT_DIAGRAM, image_path)

            logger.success(f"✅ Analyzed diagram: {Path(image_path).name}")

            return {
                "diagram_type": "technical_diagram",
                "analysis": analysis,
                "source": image_path,
            }

        except Exception as e:
            logger.error(f"Failed to analyze diagram {image_path}: {e}")
            raise

    async def extract_table_data(self, image_path: str) -> str:
        """
        Extract data from tables in images.

        Args:
            image_path: Path to image containing table

        Returns:
            Table data in markdown format (as produced by the model)

        Raises:
            Exception: Any error from reading the file or calling the model
                is logged and re-raised unchanged.
        """
        logger.info(f"📋 Extracting table from: {image_path}")

        try:
            table_markdown = await self._query_vision_model(self._PROMPT_TABLE, image_path)

            logger.success(f"✅ Extracted table from {Path(image_path).name}")
            return table_markdown

        except Exception as e:
            logger.error(f"Failed to extract table from {image_path}: {e}")
            raise

    async def analyze_patent_page(self, image_path: str) -> Dict[str, Any]:
        """
        Specialized analysis for patent document pages.

        Args:
            image_path: Path to patent page image

        Returns:
            Dictionary with keys ``page_content`` (model reply), ``source``
            (the given path) and ``type`` (always ``"patent_page"``)

        Raises:
            Exception: Any error from reading the file or calling the model
                is logged and re-raised unchanged.
        """
        logger.info(f"📄 Analyzing patent page: {image_path}")

        try:
            analysis = await self._query_vision_model(self._PROMPT_PATENT, image_path)

            logger.success(f"✅ Analyzed patent page: {Path(image_path).name}")

            return {
                "page_content": analysis,
                "source": image_path,
                "type": "patent_page",
            }

        except Exception as e:
            logger.error(f"Failed to analyze patent page {image_path}: {e}")
            raise

    async def identify_handwriting(self, image_path: str) -> str:
        """
        Extract handwritten text from images.

        Args:
            image_path: Path to image with handwritten content

        Returns:
            Extracted handwritten text

        Raises:
            Exception: Any error from reading the file or calling the model
                is logged and re-raised unchanged.
        """
        logger.info(f"✍️ Extracting handwriting from: {image_path}")

        try:
            handwriting = await self._query_vision_model(self._PROMPT_HANDWRITING, image_path)

            logger.success(f"✅ Extracted handwriting from {Path(image_path).name}")
            return handwriting

        except Exception as e:
            logger.error(f"Failed to extract handwriting from {image_path}: {e}")
            raise

    def is_available(self, timeout: float = 5.0) -> bool:
        """
        Check if vision model is available.

        Queries the Ollama ``/api/tags`` endpoint and looks for a listed
        model whose name contains ``self.model_name``.

        Args:
            timeout: Seconds to wait for the Ollama service before giving up.
                Without a timeout the request could block indefinitely.

        Returns:
            True if model is available, False otherwise (including when the
            request fails, times out, or returns a non-200 status)
        """
        try:
            import requests

            # Strip a trailing slash so the path never becomes "//api/tags".
            tags_url = self.base_url.rstrip("/") + "/api/tags"
            response = requests.get(tags_url, timeout=timeout)
            if response.status_code != 200:
                return False

            models = response.json().get("models", [])
            return any(self.model_name in model.get("name", "") for model in models)
        except Exception as e:
            # Best-effort probe: any failure is reported as "not available".
            logger.warning(f"Could not check model availability: {e}")
            return False
|
|