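# Page header captured with the file (not part of the module):
#   author avatar caption: baveshraam's picture
#   commit message: FIX: SurrealDB 2.0 migration syntax and Frontend/CORS link
#   commit: f871fed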
"""
OCR Service
Processes images to extract text using OCR (Optical Character Recognition).
Supports handwritten and printed text recognition.
"""
from __future__ import annotations
import base64
import io
from typing import Optional, List, Tuple, TYPE_CHECKING, Any
from datetime import datetime
from loguru import logger
from pydantic import BaseModel, Field
# Optional OCR dependencies: import them if present, then probe the Tesseract
# binary. The outcome is recorded in TESSERACT_AVAILABLE; when the Python
# packages are missing, `pytesseract` and `Image` are bound to None so the
# names still exist at module level.
try:
    import pytesseract
    from PIL import Image
    import platform
    import os
except ImportError as e:
    TESSERACT_AVAILABLE = False
    pytesseract = None
    Image = None  # type: ignore
    logger.warning(f"pytesseract or PIL not available: {e}")
else:
    # On Windows, point pytesseract at the default installer location when
    # the executable is actually there.
    if platform.system() == 'Windows':
        tesseract_path = r'C:\Program Files\Tesseract-OCR\tesseract.exe'
        logger.info(f"Checking for Tesseract at: {tesseract_path}")
        if not os.path.exists(tesseract_path):
            logger.warning(f"Tesseract not found at: {tesseract_path}")
        else:
            pytesseract.pytesseract.tesseract_cmd = tesseract_path
            logger.info(f"Set Tesseract path to: {tesseract_path}")

    # Importing the wrapper is not enough: ask the binary for its version to
    # confirm it can really be executed.
    try:
        version = pytesseract.get_tesseract_version()
    except Exception as e:
        TESSERACT_AVAILABLE = False
        logger.warning(f"Tesseract binary not found or not working: {e}")
    else:
        TESSERACT_AVAILABLE = True
        logger.info(f"Tesseract version {version} is available")
class OCRResult(BaseModel):
    """Outcome of one OCR pass over a single image."""

    # Required: the text produced by the OCR engine.
    raw_text: str = Field(default=..., description="Raw extracted text")
    # Optional: may be left as None when no confidence was computed.
    confidence: Optional[float] = Field(default=None, description="Confidence score 0-1")
    # One dict per recognised word; empty list when not collected.
    word_boxes: List[dict] = Field(description="Word bounding boxes", default_factory=list)
    # Required: elapsed time for the whole pass.
    processing_time_ms: int = Field(default=..., description="Processing time in milliseconds")
    # Required: format label of the input image.
    source_format: str = Field(default=..., description="Format of the source image")
class StructuredNote(BaseModel):
    """Organised view of a note derived from raw OCR text."""

    # Optional heading; None when no title is available.
    title: Optional[str] = Field(default=None)
    # Required body text of the note.
    content: str
    # The remaining fields default to fresh empty lists per instance.
    key_points: List[str] = Field(default_factory=list)
    dates_mentioned: List[str] = Field(default_factory=list)
    tags: List[str] = Field(default_factory=list)
class OCRService:
    """Service for processing images and extracting text.

    OCR is delegated to pytesseract; turning raw OCR output into a
    ``StructuredNote`` is delegated to an LLM whose plain-text answer is
    parsed by ``_parse_structured_response``.
    """

    def __init__(self):
        # Snapshot of the import-time probe of the Tesseract installation.
        self.tesseract_available = TESSERACT_AVAILABLE

    def _require_tesseract(self) -> None:
        """Raise ``RuntimeError`` when OCR cannot run in this process."""
        if not self.tesseract_available:
            raise RuntimeError("Tesseract is not available. Please install pytesseract and PIL.")

    def _decode_base64_image(self, base64_string: str) -> Any:
        """Decode a base64 string (optionally a data URL) into a PIL Image."""
        # Remove the data URL prefix if present; the payload is everything
        # after the first comma.
        if ',' in base64_string:
            base64_string = base64_string.split(',', 1)[1]
        image_data = base64.b64decode(base64_string)
        return Image.open(io.BytesIO(image_data))

    def _preprocess_image(self, image: Any) -> Any:
        """Return a grayscale copy of *image*, upscaled to at least 1000 px wide.

        An image that is already in mode ``'L'`` and wide enough is returned
        unchanged.
        """
        # Convert to grayscale
        if image.mode != 'L':
            image = image.convert('L')

        # Resize if too small, keeping the aspect ratio
        min_width = 1000
        if image.width < min_width:
            ratio = min_width / image.width
            new_size = (int(image.width * ratio), int(image.height * ratio))
            image = image.resize(new_size, Image.Resampling.LANCZOS)

        return image

    @staticmethod
    def _collect_word_boxes(data: dict) -> Tuple[List[dict], List[float]]:
        """Turn column-oriented OCR word data into word boxes and confidences.

        *data* maps ``'text'``, ``'conf'``, ``'left'``, ``'top'``, ``'width'``
        and ``'height'`` to parallel lists. Entries with blank text or a
        confidence that is non-numeric or not greater than zero are skipped.

        Returns:
            ``(word_boxes, confidences)`` where confidences are scaled from
            0-100 down to 0-1.
        """
        word_boxes = []
        confidences = []
        columns = zip(
            data['text'], data['conf'], data['left'],
            data['top'], data['width'], data['height'],
        )
        for text, raw_conf, left, top, width, height in columns:
            if not text.strip():
                continue
            # The confidence may arrive as a string rather than a number;
            # coerce it so the comparison below cannot raise TypeError.
            try:
                conf = float(raw_conf)
            except (TypeError, ValueError):
                continue
            if conf <= 0:  # Not a valid confidence
                continue
            score = conf / 100.0
            confidences.append(score)
            word_boxes.append({
                'text': text,
                'x': left,
                'y': top,
                'width': width,
                'height': height,
                'confidence': score,
            })
        return word_boxes, confidences

    def process_image_base64(self, base64_string: str) -> OCRResult:
        """Process a base64 encoded image and extract text with word-level data.

        Raises:
            RuntimeError: if Tesseract is not available.
            Exception: any decoding or OCR failure is logged and re-raised.
        """
        self._require_tesseract()

        start_time = datetime.now()
        try:
            # Decode image; the format must be read before preprocessing
            # replaces the image object.
            image = self._decode_base64_image(base64_string)
            source_format = image.format or "unknown"

            # Preprocess
            image = self._preprocess_image(image)

            # Run OCR
            raw_text = pytesseract.image_to_string(image)

            # Get word-level data
            data = pytesseract.image_to_data(image, output_type=pytesseract.Output.DICT)
            word_boxes, confidences = self._collect_word_boxes(data)

            # Calculate average confidence (0.0 when no word qualified)
            avg_confidence = sum(confidences) / len(confidences) if confidences else 0.0

            processing_time = int((datetime.now() - start_time).total_seconds() * 1000)
            return OCRResult(
                raw_text=raw_text.strip(),
                confidence=avg_confidence,
                word_boxes=word_boxes,
                processing_time_ms=processing_time,
                source_format=source_format,
            )
        except Exception as e:
            logger.error(f"OCR processing failed: {e}")
            raise

    def process_image_bytes(self, image_bytes: bytes) -> OCRResult:
        """Process raw image bytes and extract text only.

        Unlike ``process_image_base64`` this does not collect word boxes, and
        ``confidence`` is left as ``None``.

        Raises:
            RuntimeError: if Tesseract is not available.
            Exception: any decoding or OCR failure is logged and re-raised.
        """
        self._require_tesseract()

        start_time = datetime.now()
        try:
            # Open image from bytes
            image = Image.open(io.BytesIO(image_bytes))
            source_format = image.format or "unknown"

            # Preprocess
            image = self._preprocess_image(image)

            # Run OCR
            raw_text = pytesseract.image_to_string(image)

            processing_time = int((datetime.now() - start_time).total_seconds() * 1000)
            return OCRResult(
                raw_text=raw_text.strip(),
                confidence=None,  # Simplified for bytes processing
                word_boxes=[],
                processing_time_ms=processing_time,
                source_format=source_format,
            )
        except Exception as e:
            logger.error(f"OCR processing failed: {e}")
            raise

    @staticmethod
    def _parse_structured_response(response_text: str) -> dict:
        """Parse the LLM's sectioned answer into ``StructuredNote`` field values.

        Recognises ``TITLE:``, ``CONTENT:``, ``KEY_POINTS:``, ``DATES:`` and
        ``TAGS:`` headers at the start of a line, ignoring leading whitespace.

        Returns:
            A dict with keys ``title``, ``content``, ``key_points``,
            ``dates_mentioned`` and ``tags``. ``title`` and ``content`` are
            ``None`` when the response supplied nothing usable for them.
        """
        title = None
        key_points = []
        dates = []
        tags = []
        content_lines = []
        current_section = None

        for line in response_text.strip().split('\n'):
            stripped = line.strip()
            if stripped.startswith('TITLE:'):
                # Slice off the prefix only; str.replace would also delete
                # any later "TITLE:" inside the title itself.
                title_val = stripped[len('TITLE:'):].strip()
                title = title_val if title_val and title_val.lower() != 'none' else None
            elif stripped.startswith('CONTENT:'):
                current_section = 'content'
                # Keep text the model put on the header line itself.
                inline = stripped[len('CONTENT:'):].strip()
                if inline:
                    content_lines.append(inline)
            elif stripped.startswith('KEY_POINTS:'):
                current_section = 'key_points'
            elif stripped.startswith('DATES:'):
                dates_val = stripped[len('DATES:'):].strip()
                if dates_val.lower() != 'none':
                    dates = [d.strip() for d in dates_val.split(',') if d.strip()]
                current_section = None
            elif stripped.startswith('TAGS:'):
                tags_val = stripped[len('TAGS:'):].strip()
                tags = [t.strip() for t in tags_val.split(',') if t.strip()]
                current_section = None
            elif current_section == 'content':
                # Preserve the original line so inner indentation survives.
                content_lines.append(line)
            elif current_section == 'key_points' and stripped.startswith('-'):
                key_points.append(stripped[1:].strip())

        content = '\n'.join(content_lines).strip()
        return {
            'title': title,
            'content': content or None,
            'key_points': key_points,
            'dates_mentioned': dates,
            'tags': tags,
        }

    async def structure_text(self, raw_text: str) -> StructuredNote:
        """Use LLM to structure raw OCR text into organized notes.

        Blank input yields a note with empty content without calling the
        model. If the model call or the parsing fails, the error is logged and
        a note containing only *raw_text* is returned. When the model returns
        no usable content section, *raw_text* is used as the content.
        """
        from open_notebook.graphs.utils import provision_langchain_model

        if not raw_text.strip():
            return StructuredNote(content="")

        prompt = f"""Analyze this text extracted from a handwritten or printed note and structure it.
Raw Text:
{raw_text}
Please extract and organize:
1. A title (if one can be inferred)
2. The main content (cleaned up and organized)
3. Key points or important items (as a list)
4. Any dates mentioned
5. Relevant tags for categorization
Format your response as:
TITLE: <title or "None">
CONTENT:
<structured content>
KEY_POINTS:
- point 1
- point 2
DATES: date1, date2 (or "None")
TAGS: tag1, tag2, tag3"""

        try:
            model = provision_langchain_model()
            response = await model.ainvoke(prompt)
            response_text = response.content if hasattr(response, 'content') else str(response)

            fields = self._parse_structured_response(response_text)
            return StructuredNote(
                title=fields['title'],
                content=fields['content'] or raw_text,
                key_points=fields['key_points'],
                dates_mentioned=fields['dates_mentioned'],
                tags=fields['tags'],
            )
        except Exception as e:
            logger.error(f"Failed to structure text: {e}")
            return StructuredNote(content=raw_text)
# Create singleton instance: a module-level OCRService built at import time,
# so it records whether Tesseract was usable when this module was loaded.
ocr_service = OCRService()