Spaces:

milwright
/

historical-ocr

Running

App Files Files Community

historical-ocr / ocr_utils.py

milwright

submit pull for merge

85bdb4e verified 11 months ago

raw

history blame

7.38 kB

	"""
	Utility functions for OCR processing with Mistral AI.
	Contains helper functions for working with OCR responses and image handling.
	"""

	import json
	import base64
	import io
	from pathlib import Path
	from typing import Dict, List, Optional, Union, Any

	try:
	from PIL import Image
	PILLOW_AVAILABLE = True
	except ImportError:
	PILLOW_AVAILABLE = False

	from mistralai import DocumentURLChunk, ImageURLChunk, TextChunk

	def replace_images_in_markdown(markdown_str: str, images_dict: dict) -> str:
	    """
	    Replace image placeholders in markdown with base64-encoded images.

	    A placeholder has the form ``![<id>](<id>)``; every occurrence is
	    rewritten to ``![<id>](<base64 data>)``.

	    Args:
	        markdown_str: Markdown text containing image placeholders
	        images_dict: Dictionary mapping image IDs to base64 strings.
	            Entries whose value is None or empty are ignored.

	    Returns:
	        Markdown text with images replaced by base64 data
	    """
	    for img_name, base64_str in images_dict.items():
	        # Without this guard a missing payload would be interpolated into
	        # the link as the literal text "None" (or an empty target), which
	        # produces a broken image; keeping the placeholder is safer.
	        if not base64_str:
	            continue
	        markdown_str = markdown_str.replace(
	            f"![{img_name}]({img_name})", f"![{img_name}]({base64_str})"
	        )
	    return markdown_str

	def get_combined_markdown(ocr_response) -> str:
	    """
	    Build one markdown document from every page of an OCR response.

	    For each page, image placeholders are swapped for the page's base64
	    image data, then every newline is doubled to space out the content.

	    Args:
	        ocr_response: Response from OCR processing; must expose ``pages``,
	            each with ``markdown`` and ``images`` (items having ``id`` and
	            ``image_base64``)
	            See https://docs.mistral.ai/capabilities/document/ for API reference

	    Returns:
	        Page markdowns joined by a horizontal-rule separator
	    """
	    rendered_pages: list[str] = []

	    for page in ocr_response.pages:
	        # Map each image ID on this page to its base64 payload
	        images_by_id = {img.id: img.image_base64 for img in page.images}

	        # Embed the images, then double every newline for looser spacing
	        with_images = replace_images_in_markdown(page.markdown, images_by_id)
	        rendered_pages.append(with_images.replace("\n", "\n\n"))

	    # A markdown rule marks the boundary between consecutive pages
	    return "\n\n---\n\n".join(rendered_pages)

	def encode_image_for_api(image_path: Union[str, Path]) -> str:
	    """
	    Encode an image file as a base64 data URL for API use.

	    The MIME type in the data URL is derived from the file extension
	    (e.g. ``.png`` -> ``image/png``). When the extension is unknown or
	    does not map to an image type, ``image/jpeg`` is used.

	    Args:
	        image_path: Path to the image file

	    Returns:
	        Base64 data URL for the image

	    Raises:
	        FileNotFoundError: If ``image_path`` does not point to a file
	    """
	    # Function-scope import: mimetypes is stdlib but is not imported at
	    # the top of this module.
	    import mimetypes

	    # Path() accepts both str and Path, so no isinstance branch is needed
	    image_file = Path(image_path)

	    # Verify image exists
	    if not image_file.is_file():
	        raise FileNotFoundError(f"Image file not found: {image_file}")

	    # Label the payload with its real type instead of always claiming JPEG
	    guessed_type, _ = mimetypes.guess_type(image_file.name)
	    if guessed_type and guessed_type.startswith("image/"):
	        mime_type = guessed_type
	    else:
	        mime_type = "image/jpeg"

	    # Encode image as base64
	    encoded = base64.b64encode(image_file.read_bytes()).decode()
	    return f"data:{mime_type};base64,{encoded}"

	def process_image_with_ocr(client, image_path: Union[str, Path], model: str = "mistral-ocr-latest"):
	    """
	    Run OCR on a single image file and hand back the raw response.

	    Args:
	        client: Mistral AI client (its ``ocr.process`` method is called)
	        image_path: Path to the image file
	        model: OCR model to use

	    Returns:
	        OCR response object as returned by ``client.ocr.process``
	    """
	    # Wrap the file's base64 data URL in the chunk type the OCR call takes
	    image_chunk = ImageURLChunk(image_url=encode_image_for_api(image_path))

	    # Submit the image and return whatever the client returns
	    return client.ocr.process(document=image_chunk, model=model)

	def ocr_response_to_json(ocr_response, indent: int = 4) -> str:
	    """
	    Serialize an OCR response as pretty-printed JSON.

	    Args:
	        ocr_response: OCR response object exposing ``model_dump_json()``
	        indent: Indentation level for JSON formatting

	    Returns:
	        Formatted JSON string
	    """
	    # Round-trip through json so the output is re-rendered with our indent
	    raw_json = ocr_response.model_dump_json()
	    return json.dumps(json.loads(raw_json), indent=indent)

	def _compress_image_to_data_url(image_base64: str, max_width: int, quality: int) -> str:
	    """Decode one base64 image, downscale/re-encode it, and return a data URL."""
	    # Accept both a bare base64 payload and a "data:...;base64,<payload>" URL
	    payload = image_base64.split(',')[1] if ',' in image_base64 else image_base64
	    pil_img = Image.open(io.BytesIO(base64.b64decode(payload)))

	    # Capture the source format BEFORE resizing: images produced by PIL
	    # methods such as resize() have format None, so reading it afterwards
	    # would re-encode every downscaled image as JPEG.
	    image_format = pil_img.format or 'JPEG'

	    # Resize if needed (maintain aspect ratio)
	    original_width, original_height = pil_img.size
	    if original_width > max_width:
	        ratio = max_width / original_width
	        # Keep at least one pixel of height for extremely wide images
	        new_height = max(1, int(original_height * ratio))
	        pil_img = pil_img.resize((max_width, new_height), Image.LANCZOS)

	    # Re-encode; the quality setting is only passed for JPEG output
	    buffer = io.BytesIO()
	    if image_format.upper() in ('JPEG', 'JPG'):
	        pil_img.save(buffer, format=image_format, quality=quality, optimize=True)
	    else:
	        # For non-JPEG formats (PNG, etc.)
	        pil_img.save(buffer, format=image_format, optimize=True)

	    # Convert back to base64
	    compressed_base64 = base64.b64encode(buffer.getvalue()).decode('utf-8')
	    return f"data:image/{image_format.lower()};base64,{compressed_base64}"

	def get_combined_markdown_compressed(ocr_response, max_width: int = 800, quality: int = 85) -> str:
	    """
	    Combine OCR text and images into a single markdown document with compressed images.
	    Reduces image sizes to improve performance.

	    Images wider than ``max_width`` are downscaled and every image is
	    re-encoded in its original format. If an image cannot be processed,
	    its original data is embedded unchanged. Images with no base64 data
	    are skipped, leaving their placeholder in the markdown.

	    Args:
	        ocr_response: Response from OCR processing containing text and images
	        max_width: Maximum width to resize images to (preserves aspect ratio)
	        quality: JPEG quality (0-100) for compression

	    Returns:
	        Combined markdown string with embedded compressed images
	    """
	    if not PILLOW_AVAILABLE:
	        # Fall back to regular method if PIL is not available
	        return get_combined_markdown(ocr_response)

	    markdowns: list[str] = []

	    # Process each page
	    for page in ocr_response.pages:
	        image_data = {}

	        # Process and compress each image
	        for img in page.images:
	            if not img.image_base64:
	                # Nothing to embed; do not inject "None" into the markdown
	                continue
	            try:
	                image_data[img.id] = _compress_image_to_data_url(
	                    img.image_base64, max_width, quality
	                )
	            except Exception:
	                # Deliberately best-effort: if decoding or compression
	                # fails for any reason, use the original image
	                image_data[img.id] = img.image_base64

	        # Replace image placeholders with compressed images
	        page_markdown = replace_images_in_markdown(page.markdown, image_data)

	        # Ensure proper spacing between paragraphs and images
	        page_markdown = page_markdown.replace("\n", "\n\n")

	        # Add page to list
	        markdowns.append(page_markdown)

	    # Join pages with clear separators
	    return "\n\n---\n\n".join(markdowns)

	# Notebook helper: only defined when IPython can be imported
	try:
	    from IPython.display import Markdown, display
	except ImportError:
	    # IPython not available
	    pass
	else:
	    def display_ocr_with_images(ocr_response):
	        """
	        Render an OCR response, images included, in an IPython environment.

	        Args:
	            ocr_response: OCR response object
	        """
	        # Build the combined markdown and pass it to IPython's renderer
	        display(Markdown(get_combined_markdown(ocr_response)))