Spaces:
Running
Running
| """ | |
| Utility functions for OCR processing with Mistral AI. | |
| Contains helper functions for working with OCR responses and image handling. | |
| """ | |
| import json | |
| import base64 | |
| import io | |
| from pathlib import Path | |
| from typing import Dict, List, Optional, Union, Any | |
| try: | |
| from PIL import Image | |
| PILLOW_AVAILABLE = True | |
| except ImportError: | |
| PILLOW_AVAILABLE = False | |
| from mistralai import DocumentURLChunk, ImageURLChunk, TextChunk | |
def replace_images_in_markdown(markdown_str: str, images_dict: dict) -> str:
    """
    Replace image placeholders in markdown with base64-encoded images.

    A placeholder is a markdown image whose alt text and target are both the
    image ID, i.e. ``![<id>](<id>)``. The target is swapped for the base64
    data while the alt text is kept, so the image renders inline.

    Args:
        markdown_str: Markdown text containing image placeholders
        images_dict: Dictionary mapping image IDs to base64 strings

    Returns:
        Markdown text with images replaced by base64 data
    """
    for img_name, base64_str in images_dict.items():
        # Match the full placeholder (not just the ID) so that plain-text
        # occurrences of the image ID elsewhere in the page are left alone.
        markdown_str = markdown_str.replace(
            f"![{img_name}]({img_name})", f"![{img_name}]({base64_str})"
        )
    return markdown_str
def get_combined_markdown(ocr_response) -> str:
    """
    Build a single markdown document from every page of an OCR response.

    For each page, the image placeholders in the page markdown are replaced
    with the page's base64 images, and every newline is doubled to put extra
    spacing between paragraphs and images.

    Args:
        ocr_response: Response from OCR processing; its ``pages`` are read,
            and for each page its ``markdown`` and ``images`` (each image
            providing ``id`` and ``image_base64``)
            See https://docs.mistral.ai/capabilities/document/ for API reference

    Returns:
        The page markdowns joined by a horizontal-rule separator
    """
    rendered_pages: list[str] = []
    for page in ocr_response.pages:
        # Map each image ID on this page to its base64 payload
        images_by_id = {image.id: image.image_base64 for image in page.images}
        # Swap the placeholders for the actual image data
        with_images = replace_images_in_markdown(page.markdown, images_by_id)
        # Double every newline so paragraphs and images are spaced apart
        rendered_pages.append(with_images.replace("\n", "\n\n"))
    # A markdown rule marks the boundary between consecutive pages
    return "\n\n---\n\n".join(rendered_pages)
def encode_image_for_api(image_path: Union[str, Path]) -> str:
    """
    Encode an image as a base64 data URL for API use.

    The MIME type in the data URL is guessed from the file extension, so a
    ``.png`` file is labelled ``image/png`` rather than always being reported
    as JPEG. When the extension is unknown or does not map to an image type,
    ``image/jpeg`` is used as the fallback.

    Args:
        image_path: Path to the image file

    Returns:
        Base64 data URL for the image

    Raises:
        FileNotFoundError: If ``image_path`` does not point to an existing file
    """
    # Local import keeps this function self-contained
    import mimetypes

    # Path() accepts both strings and existing Path objects
    image_file = Path(image_path)
    # Verify image exists
    if not image_file.is_file():
        raise FileNotFoundError(f"Image file not found: {image_file}")

    # Derive the MIME type from the extension; only trust image/* results
    guessed_type, _ = mimetypes.guess_type(image_file.name)
    if guessed_type and guessed_type.startswith("image/"):
        mime_type = guessed_type
    else:
        mime_type = "image/jpeg"

    # Encode image as base64
    encoded = base64.b64encode(image_file.read_bytes()).decode()
    return f"data:{mime_type};base64,{encoded}"
def process_image_with_ocr(client, image_path: Union[str, Path], model: str = "mistral-ocr-latest"):
    """
    Run OCR on a single image file and return the response.

    The image is encoded with ``encode_image_for_api`` and passed to
    ``client.ocr.process`` wrapped in an ``ImageURLChunk``.

    Args:
        client: Mistral AI client
        image_path: Path to the image file
        model: OCR model to use

    Returns:
        OCR response object
    """
    # The image is passed inline as a base64 data URL
    data_url = encode_image_for_api(image_path)
    return client.ocr.process(
        document=ImageURLChunk(image_url=data_url),
        model=model,
    )
def ocr_response_to_json(ocr_response, indent: int = 4) -> str:
    """
    Render an OCR response as a formatted JSON string.

    The response is serialized with its own ``model_dump_json()``, parsed,
    and re-serialized with ``json.dumps`` using the requested indentation.

    Args:
        ocr_response: OCR response object exposing ``model_dump_json()``
        indent: Indentation level for JSON formatting

    Returns:
        Formatted JSON string
    """
    # Let the response serialize itself, then parse into plain Python data
    raw_json = ocr_response.model_dump_json()
    parsed = json.loads(raw_json)
    # Re-serialize with the requested indentation
    return json.dumps(parsed, indent=indent)
def _compress_image_base64(image_base64: str, max_width: int, quality: int) -> str:
    """Re-encode one base64 image, downscaled to at most max_width, as a data URL."""
    # Strip an optional "data:<mime>;base64," prefix before decoding
    _, separator, payload = image_base64.partition(",")
    img_bytes = base64.b64decode(payload if separator else image_base64)

    pil_img = Image.open(io.BytesIO(img_bytes))
    # Capture the format BEFORE resizing: images produced by resize() have
    # format None, which would otherwise force every resized image to JPEG.
    image_format = (pil_img.format or "JPEG").upper()
    if image_format == "JPG":
        image_format = "JPEG"

    # Resize if needed (maintain aspect ratio)
    original_width, original_height = pil_img.size
    if original_width > max_width:
        ratio = max_width / original_width
        # Clamp to 1 so very wide, thin images never get a zero height
        new_height = max(1, int(original_height * ratio))
        pil_img = pil_img.resize((max_width, new_height), Image.LANCZOS)

    # Convert to bytes with compression
    buffer = io.BytesIO()
    if image_format == "JPEG":
        pil_img.save(buffer, format=image_format, quality=quality, optimize=True)
    else:
        # For non-JPEG formats (PNG, etc.) the quality setting is not passed
        pil_img.save(buffer, format=image_format, optimize=True)

    # Convert back to base64
    compressed_base64 = base64.b64encode(buffer.getvalue()).decode("utf-8")
    return f"data:image/{image_format.lower()};base64,{compressed_base64}"


def get_combined_markdown_compressed(ocr_response, max_width: int = 800, quality: int = 85) -> str:
    """
    Combine OCR text and images into a single markdown document with compressed images.
    Reduces image sizes to improve performance.

    Each image is re-encoded in its original format. Images wider than
    ``max_width`` are downscaled first. If an image cannot be processed, its
    original base64 data is embedded unchanged. When Pillow is not installed,
    this falls back to ``get_combined_markdown``.

    Args:
        ocr_response: Response from OCR processing containing text and images
        max_width: Maximum width to resize images to (preserves aspect ratio)
        quality: JPEG quality (0-100) for compression; only applied to JPEG images

    Returns:
        Combined markdown string with embedded compressed images
    """
    if not PILLOW_AVAILABLE:
        # Fall back to regular method if PIL is not available
        return get_combined_markdown(ocr_response)

    markdowns: list[str] = []
    # Process each page
    for page in ocr_response.pages:
        image_data = {}
        # Process and compress each image
        for img in page.images:
            try:
                image_data[img.id] = _compress_image_base64(
                    img.image_base64, max_width, quality
                )
            except Exception:
                # Deliberate best-effort: if decoding or compression fails
                # for any reason, embed the original image instead.
                image_data[img.id] = img.image_base64
        # Replace image placeholders with compressed images
        page_markdown = replace_images_in_markdown(page.markdown, image_data)
        # Ensure proper spacing between paragraphs and images
        page_markdown = page_markdown.replace("\n", "\n\n")
        # Add page to list
        markdowns.append(page_markdown)
    # Join pages with clear separators
    return "\n\n---\n\n".join(markdowns)
# Notebook helper: only defined when IPython can be imported
try:
    from IPython.display import Markdown, display
except ImportError:
    # IPython not available
    pass
else:
    def display_ocr_with_images(ocr_response):
        """
        Render an OCR response, with its images embedded, in an IPython environment.

        The combined markdown from ``get_combined_markdown`` is wrapped in an
        IPython ``Markdown`` object and passed to ``display``.

        Args:
            ocr_response: OCR response object
        """
        display(Markdown(get_combined_markdown(ocr_response)))