Spaces:

baveshraam
/

open-notebook

Sleeping

File size: 9,780 Bytes

f871fed

"""
OCR Service

Processes images to extract text using OCR (Optical Character Recognition).
Supports handwritten and printed text recognition.
"""

from __future__ import annotations

import base64
import io
import time
from datetime import datetime
from typing import Optional, List, Tuple, TYPE_CHECKING, Any

from loguru import logger
from pydantic import BaseModel, Field

# Optional OCR backend. pytesseract and Pillow are imported if installed, and
# TESSERACT_AVAILABLE records whether the Tesseract binary answered a version
# query. When the imports fail, `pytesseract` and `Image` are bound to None.
try:
    import pytesseract
    from PIL import Image

    import os
    import platform

    # On Windows, use the hard-coded install path below when the executable
    # exists there; otherwise pytesseract keeps its default command.
    if platform.system() == 'Windows':
        tesseract_path = r'C:\Program Files\Tesseract-OCR\tesseract.exe'
        logger.info(f"Checking for Tesseract at: {tesseract_path}")
        if not os.path.exists(tesseract_path):
            logger.warning(f"Tesseract not found at: {tesseract_path}")
        else:
            pytesseract.pytesseract.tesseract_cmd = tesseract_path
            logger.info(f"Set Tesseract path to: {tesseract_path}")

    # Importing the wrapper is not enough: ask the binary for its version to
    # find out whether it can really be executed.
    try:
        version = pytesseract.get_tesseract_version()
    except Exception as e:
        TESSERACT_AVAILABLE = False
        logger.warning(f"Tesseract binary not found or not working: {e}")
    else:
        TESSERACT_AVAILABLE = True
        logger.info(f"Tesseract version {version} is available")
except ImportError as e:
    # Python packages missing: leave placeholders so later references to the
    # names resolve, and mark OCR as unavailable.
    Image = None  # type: ignore
    pytesseract = None
    TESSERACT_AVAILABLE = False
    logger.warning(f"pytesseract or PIL not available: {e}")


class OCRResult(BaseModel):
    """Result from OCR processing.

    Returned by ``OCRService.process_image_base64`` and
    ``OCRService.process_image_bytes``.
    """
    # Text returned by the OCR engine; the service strips surrounding whitespace.
    raw_text: str = Field(..., description="Raw extracted text")
    # Mean per-word confidence on a 0-1 scale; None when the producing method
    # does not compute it (process_image_bytes).
    confidence: Optional[float] = Field(None, description="Confidence score 0-1")
    # One dict per recognised word with keys 'text', 'x', 'y', 'width',
    # 'height' and 'confidence'; empty when word-level data was not collected.
    word_boxes: List[dict] = Field(default_factory=list, description="Word bounding boxes")
    # Elapsed time of the whole processing call, truncated to whole milliseconds.
    processing_time_ms: int = Field(..., description="Processing time in milliseconds")
    # Format reported by the opened image, or "unknown" when none is reported.
    source_format: str = Field(..., description="Format of the source image")


class StructuredNote(BaseModel):
    """Structured note extracted from OCR text.

    Built by ``OCRService.structure_text`` from the LLM reply; when structuring
    fails only ``content`` is filled (with the raw OCR text).
    """
    # Title inferred by the LLM; None when it answered "None" or gave no title.
    title: Optional[str] = None
    # Cleaned-up body text, or the raw OCR text when no content was parsed.
    content: str
    # Bullet items listed under the KEY_POINTS section of the reply.
    key_points: List[str] = Field(default_factory=list)
    # Date strings copied verbatim from the DATES line (not parsed or validated).
    dates_mentioned: List[str] = Field(default_factory=list)
    # Categorisation tags taken from the TAGS line.
    tags: List[str] = Field(default_factory=list)


class OCRService:
    """Service for processing images and extracting text."""

    def __init__(self) -> None:
        """Capture the module-level Tesseract availability flag on the instance."""
        # Snapshot of TESSERACT_AVAILABLE as computed at import time; the
        # process_image_* methods raise RuntimeError when this is False.
        self.tesseract_available = TESSERACT_AVAILABLE

    def _decode_base64_image(self, base64_string: str) -> Any:
        """Decode base64 string to PIL Image."""
        # Remove data URL prefix if present
        if ',' in base64_string:
            base64_string = base64_string.split(',')[1]
        
        image_data = base64.b64decode(base64_string)
        return Image.open(io.BytesIO(image_data))

    def _preprocess_image(self, image: Any) -> Any:
        """Preprocess image for better OCR results."""
        # Convert to grayscale
        if image.mode != 'L':
            image = image.convert('L')
        
        # Resize if too small
        min_width = 1000
        if image.width < min_width:
            ratio = min_width / image.width
            new_size = (int(image.width * ratio), int(image.height * ratio))
            image = image.resize(new_size, Image.Resampling.LANCZOS)
        
        return image
        
        return image

    def process_image_base64(self, base64_string: str) -> OCRResult:
        """Process base64 encoded image and extract text."""
        if not self.tesseract_available:
            raise RuntimeError("Tesseract is not available. Please install pytesseract and PIL.")
        
        start_time = datetime.now()
        
        try:
            # Decode image
            image = self._decode_base64_image(base64_string)
            source_format = image.format or "unknown"
            
            # Preprocess
            image = self._preprocess_image(image)
            
            # Run OCR
            raw_text = pytesseract.image_to_string(image)
            
            # Get word-level data
            data = pytesseract.image_to_data(image, output_type=pytesseract.Output.DICT)
            
            # Extract word boxes and confidence
            word_boxes = []
            confidences = []
            
            for i in range(len(data['text'])):
                if data['text'][i].strip():
                    conf = data['conf'][i]
                    if conf > 0:  # Valid confidence
                        confidences.append(conf / 100.0)
                        word_boxes.append({
                            'text': data['text'][i],
                            'x': data['left'][i],
                            'y': data['top'][i],
                            'width': data['width'][i],
                            'height': data['height'][i],
                            'confidence': conf / 100.0
                        })
            
            # Calculate average confidence
            avg_confidence = sum(confidences) / len(confidences) if confidences else 0.0
            
            processing_time = int((datetime.now() - start_time).total_seconds() * 1000)
            
            return OCRResult(
                raw_text=raw_text.strip(),
                confidence=avg_confidence,
                word_boxes=word_boxes,
                processing_time_ms=processing_time,
                source_format=source_format,
            )
            
        except Exception as e:
            logger.error(f"OCR processing failed: {e}")
            raise

    def process_image_bytes(self, image_bytes: bytes) -> OCRResult:
        """Process image bytes and extract text."""
        if not self.tesseract_available:
            raise RuntimeError("Tesseract is not available. Please install pytesseract and PIL.")
        
        start_time = datetime.now()
        
        try:
            # Open image from bytes
            image = Image.open(io.BytesIO(image_bytes))
            source_format = image.format or "unknown"
            
            # Preprocess
            image = self._preprocess_image(image)
            
            # Run OCR
            raw_text = pytesseract.image_to_string(image)
            
            processing_time = int((datetime.now() - start_time).total_seconds() * 1000)
            
            return OCRResult(
                raw_text=raw_text.strip(),
                confidence=None,  # Simplified for bytes processing
                word_boxes=[],
                processing_time_ms=processing_time,
                source_format=source_format,
            )
            
        except Exception as e:
            logger.error(f"OCR processing failed: {e}")
            raise

    async def structure_text(self, raw_text: str) -> StructuredNote:
        """Use LLM to structure raw OCR text into organized notes."""
        from open_notebook.graphs.utils import provision_langchain_model
        
        if not raw_text.strip():
            return StructuredNote(content="")
        
        prompt = f"""Analyze this text extracted from a handwritten or printed note and structure it.

Raw Text:
{raw_text}

Please extract and organize:
1. A title (if one can be inferred)
2. The main content (cleaned up and organized)
3. Key points or important items (as a list)
4. Any dates mentioned
5. Relevant tags for categorization

Format your response as:
TITLE: <title or "None">
CONTENT:
<structured content>
KEY_POINTS:
- point 1
- point 2
DATES: date1, date2 (or "None")
TAGS: tag1, tag2, tag3"""

        try:
            model = provision_langchain_model()
            response = await model.ainvoke(prompt)
            response_text = response.content if hasattr(response, 'content') else str(response)
            
            # Parse response
            title = None
            content = raw_text
            key_points = []
            dates = []
            tags = []
            
            lines = response_text.strip().split('\n')
            current_section = None
            content_lines = []
            
            for line in lines:
                if line.startswith('TITLE:'):
                    title_val = line.replace('TITLE:', '').strip()
                    title = title_val if title_val.lower() != 'none' else None
                elif line.startswith('CONTENT:'):
                    current_section = 'content'
                elif line.startswith('KEY_POINTS:'):
                    current_section = 'key_points'
                elif line.startswith('DATES:'):
                    dates_val = line.replace('DATES:', '').strip()
                    if dates_val.lower() != 'none':
                        dates = [d.strip() for d in dates_val.split(',')]
                    current_section = None
                elif line.startswith('TAGS:'):
                    tags_val = line.replace('TAGS:', '').strip()
                    tags = [t.strip() for t in tags_val.split(',') if t.strip()]
                    current_section = None
                elif current_section == 'content':
                    content_lines.append(line)
                elif current_section == 'key_points' and line.strip().startswith('-'):
                    key_points.append(line.strip()[1:].strip())
            
            if content_lines:
                content = '\n'.join(content_lines).strip()
            
            return StructuredNote(
                title=title,
                content=content,
                key_points=key_points,
                dates_mentioned=dates,
                tags=tags,
            )
            
        except Exception as e:
            logger.error(f"Failed to structure text: {e}")
            return StructuredNote(content=raw_text)


# Module-level shared instance, created once when this module is imported.
# It captures the TESSERACT_AVAILABLE value computed above.
ocr_service = OCRService()