Spaces:
Running
Running
File size: 4,249 Bytes
85bdb4e |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 |
"""
Utility functions for OCR processing with Mistral AI.
Contains helper functions for working with OCR responses and image handling.
"""
import json
import base64
from pathlib import Path
from typing import Dict, List, Optional, Union
from mistralai import DocumentURLChunk, ImageURLChunk, TextChunk
def replace_images_in_markdown(markdown_str: str, images_dict: dict) -> str:
    """
    Replace image placeholders in markdown with base64-encoded images.

    A placeholder is a markdown image whose alt text and link target are both
    the image ID, i.e. ``![<id>](<id>)``. The link target is swapped for the
    matching base64 string so the image is embedded inline.

    Args:
        markdown_str: Markdown text containing image placeholders
        images_dict: Dictionary mapping image IDs to base64 strings
            (assumed to be usable directly as a link target, e.g. a
            ``data:`` URL -- TODO confirm against the OCR response format)

    Returns:
        Markdown text with images replaced by base64 data
    """
    for img_name, base64_str in images_dict.items():
        # Keep the alt text, replace only the link target with the image data
        markdown_str = markdown_str.replace(
            f"![{img_name}]({img_name})", f"![{img_name}]({base64_str})"
        )
    return markdown_str
def get_combined_markdown(ocr_response) -> str:
    """
    Combine OCR text and images into a single markdown document.
    Ensures proper spacing between text and images.

    Args:
        ocr_response: Response from OCR processing containing text and images.
            Must expose ``pages``; each page must expose ``markdown`` and
            ``images``, and each image ``id`` and ``image_base64``.
            See https://docs.mistral.ai/capabilities/document/ for API reference

    Returns:
        Combined markdown string with embedded images; pages are separated by
        a horizontal rule. Returns an empty string when there are no pages.
    """
    markdowns: list[str] = []
    for page in ocr_response.pages:
        # Collect this page's images. Entries without a base64 string are
        # skipped so their placeholder stays in the text instead of the
        # replacement raising a TypeError on a non-string value.
        image_data = {
            img.id: img.image_base64
            for img in page.images
            if isinstance(img.image_base64, str)
        }
        # Replace image placeholders with actual images
        page_markdown = replace_images_in_markdown(page.markdown, image_data)
        # Double every newline so paragraphs and images render as separate blocks
        # NOTE(review): this also splits single-newline constructs such as
        # table rows -- confirm that is acceptable for the expected documents
        page_markdown = page_markdown.replace("\n", "\n\n")
        markdowns.append(page_markdown)
    # Join pages with clear separators for multi-page documents
    return "\n\n---\n\n".join(markdowns)
def encode_image_for_api(image_path: Union[str, Path]) -> str:
    """
    Encode an image as base64 for API use.

    The MIME type in the data URL is guessed from the file name so that
    PNG, GIF, WebP, etc. are labelled correctly. When the type cannot be
    determined, or is not an image type, it falls back to ``image/jpeg``.

    Args:
        image_path: Path to the image file

    Returns:
        Base64 data URL for the image

    Raises:
        FileNotFoundError: If ``image_path`` does not point to an existing file
    """
    # Imported locally so the module's top-level import block is unchanged
    import mimetypes

    # Path() accepts both strings and existing Path objects
    image_file = Path(image_path)
    # Verify image exists
    if not image_file.is_file():
        raise FileNotFoundError(f"Image file not found: {image_file}")
    # Guess from the extension only; the file contents are not inspected
    mime_type, _ = mimetypes.guess_type(image_file.name)
    if mime_type is None or not mime_type.startswith("image/"):
        mime_type = "image/jpeg"
    # Encode image as base64
    encoded = base64.b64encode(image_file.read_bytes()).decode()
    return f"data:{mime_type};base64,{encoded}"
def process_image_with_ocr(client, image_path: Union[str, Path], model: str = "mistral-ocr-latest"):
    """
    Run OCR on a single image file and hand back the raw response.

    Args:
        client: Mistral AI client
        image_path: Path to the image file
        model: OCR model to use

    Returns:
        OCR response object
    """
    # The image is sent inline as a base64 data URL
    data_url = encode_image_for_api(image_path)
    return client.ocr.process(
        model=model,
        document=ImageURLChunk(image_url=data_url),
    )
def ocr_response_to_json(ocr_response, indent: int = 4) -> str:
    """
    Render an OCR response as a pretty-printed JSON string.

    Args:
        ocr_response: OCR response object; must provide ``model_dump_json()``
        indent: Indentation level for JSON formatting

    Returns:
        Formatted JSON string
    """
    # Round-trip through json so the output uses the requested indentation
    raw_json = ocr_response.model_dump_json()
    parsed = json.loads(raw_json)
    return json.dumps(parsed, indent=indent)
# For display in notebooks: the helper below exists only when IPython is importable
try:
    from IPython.display import Markdown, display
except ImportError:
    # IPython not available; display_ocr_with_images is intentionally left undefined
    pass
else:
    def display_ocr_with_images(ocr_response):
        """
        Display OCR response with embedded images in IPython environments.

        Args:
            ocr_response: OCR response object
        """
        combined_markdown = get_combined_markdown(ocr_response)
        display(Markdown(combined_markdown))