Spaces:
Running
Running
| """ | |
| Utility functions for OCR processing with Mistral AI. | |
| Contains helper functions for working with OCR responses and image handling. | |
| """ | |
| import json | |
| import base64 | |
| from pathlib import Path | |
| from typing import Dict, List, Optional, Union | |
| from mistralai import DocumentURLChunk, ImageURLChunk, TextChunk | |
def replace_images_in_markdown(markdown_str: str, images_dict: dict) -> str:
    """
    Replace image placeholders in markdown with base64-encoded images.

    A placeholder is a markdown image reference whose alt text and target are
    both the image ID, i.e. ``![<id>](<id>)``. Each occurrence is rewritten to
    ``![<id>](<base64 string>)`` so the image is embedded in the document.

    Args:
        markdown_str: Markdown text containing image placeholders
        images_dict: Dictionary mapping image IDs to base64 strings. The value
            is inserted verbatim as the image target, so it is presumably
            expected to be a ready-to-render ``data:`` URL (confirm against
            the caller). Entries with an empty or ``None`` value are skipped.

    Returns:
        Markdown text with images replaced by base64 data
    """
    for img_name, base64_str in images_dict.items():
        # No payload for this image: keep the placeholder untouched rather
        # than emitting a broken link such as "![id](None)".
        if not base64_str:
            continue
        markdown_str = markdown_str.replace(
            f"![{img_name}]({img_name})", f"![{img_name}]({base64_str})"
        )
    return markdown_str
def get_combined_markdown(ocr_response) -> str:
    """
    Build a single markdown document from every page of an OCR response.

    For each page, the page's images are substituted into its markdown via
    replace_images_in_markdown, every newline in the result is doubled, and
    the finished pages are joined with a horizontal-rule separator.

    Args:
        ocr_response: OCR response exposing ``pages``; each page provides
            ``markdown`` and ``images`` (each image has ``id`` and
            ``image_base64``)
            See https://docs.mistral.ai/capabilities/document/ for API reference

    Returns:
        Combined markdown string with embedded images
    """
    rendered_pages: list[str] = []
    for page in ocr_response.pages:
        # Map each image ID on this page to its base64 payload
        base64_by_id = {image.id: image.image_base64 for image in page.images}
        # Swap the image placeholders for the actual image data
        with_images = replace_images_in_markdown(page.markdown, base64_by_id)
        # Turn every single newline into a blank line so paragraphs and
        # images are spaced apart when rendered
        rendered_pages.append("\n\n".join(with_images.split("\n")))
    # A horizontal rule marks the boundary between pages
    return "\n\n---\n\n".join(rendered_pages)
def encode_image_for_api(image_path: Union[str, Path]) -> str:
    """
    Encode an image as base64 for API use.

    The MIME type in the returned data URL is guessed from the file name
    (e.g. ``.png`` -> ``image/png``). When the extension is missing, unknown,
    or does not map to an ``image/*`` type, ``image/jpeg`` is used.

    Args:
        image_path: Path to the image file

    Returns:
        Base64 data URL for the image (``data:<mime>;base64,<payload>``)

    Raises:
        FileNotFoundError: If ``image_path`` does not point to an existing file
    """
    # Imported locally so this function stays self-contained without
    # touching the module's import section.
    import mimetypes

    # Path() accepts both str and Path (and any other os.PathLike)
    image_file = Path(image_path)

    # Verify image exists
    if not image_file.is_file():
        raise FileNotFoundError(f"Image file not found: {image_file}")

    # Label the payload with the file's real type instead of always claiming
    # JPEG; fall back to JPEG when the type cannot be determined.
    mime_type, _ = mimetypes.guess_type(image_file.name)
    if not mime_type or not mime_type.startswith("image/"):
        mime_type = "image/jpeg"

    # Encode image as base64 (the base64 alphabet is pure ASCII)
    encoded = base64.b64encode(image_file.read_bytes()).decode("ascii")
    return f"data:{mime_type};base64,{encoded}"
def process_image_with_ocr(client, image_path: Union[str, Path], model: str = "mistral-ocr-latest"):
    """
    Run OCR on a single image file and hand back the raw response.

    Args:
        client: Mistral AI client; its ``ocr.process`` method is called
        image_path: Path to the image file
        model: OCR model to use

    Returns:
        OCR response object returned by ``client.ocr.process``
    """
    # The image is sent inline as a base64 data URL
    data_url = encode_image_for_api(image_path)
    # Wrap the data URL in an image chunk and submit it for OCR
    return client.ocr.process(
        model=model,
        document=ImageURLChunk(image_url=data_url),
    )
def ocr_response_to_json(ocr_response, indent: int = 4) -> str:
    """
    Serialize an OCR response as pretty-printed JSON.

    The response's own ``model_dump_json()`` output is parsed and then
    re-serialized so that the requested indentation is applied.

    Args:
        ocr_response: OCR response object providing ``model_dump_json()``
        indent: Indentation level for JSON formatting

    Returns:
        Formatted JSON string
    """
    # Round-trip through Python objects to re-format the JSON text
    return json.dumps(json.loads(ocr_response.model_dump_json()), indent=indent)
# Optional notebook helper: only defined when IPython can be imported
try:
    from IPython.display import Markdown, display
except ImportError:
    # IPython not available
    pass
else:
    def display_ocr_with_images(ocr_response):
        """
        Render an OCR response, images included, in an IPython environment.

        Args:
            ocr_response: OCR response object
        """
        display(Markdown(get_combined_markdown(ocr_response)))