This view is limited to 50 files because it contains too many changes.  See the raw diff here.
Files changed (50) hide show
  1. .gitattributes +56 -4
  2. .gitignore +0 -44
  3. README.md +127 -35
  4. __pycache__/config.cpython-312.pyc +0 -0
  5. __pycache__/constants.cpython-312.pyc +0 -0
  6. __pycache__/error_handler.cpython-312.pyc +0 -0
  7. __pycache__/image_segmentation.cpython-312.pyc +0 -0
  8. __pycache__/language_detection.cpython-312.pyc +0 -0
  9. __pycache__/ocr_processing.cpython-312.pyc +0 -0
  10. __pycache__/ocr_utils.cpython-312.pyc +0 -0
  11. __pycache__/preprocessing.cpython-312.pyc +0 -0
  12. __pycache__/process_file.cpython-312.pyc +0 -0
  13. __pycache__/structured_ocr.cpython-312.pyc +0 -0
  14. __pycache__/ui_components.cpython-312.pyc +0 -0
  15. __pycache__/utils.cpython-312.pyc +0 -0
  16. app.py +551 -554
  17. backup/app.py +535 -0
  18. backup/config.py +17 -0
  19. input/magician-or-bottle-cungerer.jpg → backup/input/The Magician, or Bottle Cungerer.jpeg +0 -0
  20. input/baldwin-15th-north.jpg → backup/input/baldwin-letter-1.jpg +0 -0
  21. input/americae-retectio.jpg → backup/input/baldwin-letter-2.jpg +2 -2
  22. backup/input/flier.png +0 -0
  23. input/baldwin-letter.jpg → backup/input/letter-1.jpg +2 -2
  24. input/gender.jpg → backup/input/letter-2.jpg +2 -2
  25. input/photo-baldwin-letter.jpg → backup/input/letter-3.jpg +2 -2
  26. backup/input/magellan-travels.jpg +3 -0
  27. input/handwritten-journal.jpg → backup/input/menu.pdf +2 -2
  28. backup/input/recipe.jpg +0 -0
  29. backup/ocr_utils.py +136 -0
  30. backup/pdf_ocr.py +76 -0
  31. backup/requirements.txt +10 -0
  32. backup/structured_ocr.py +414 -0
  33. config.py +9 -59
  34. constants.py +0 -193
  35. error_handler.py +0 -65
  36. image_segmentation.py +0 -253
  37. input/The Magician, or Bottle Cungerer.jpeg +3 -0
  38. input/baldwin-letter-1.jpg +3 -0
  39. input/baldwin-letter-2.jpg +3 -0
  40. input/flier.png +0 -0
  41. input/harpers.pdf +0 -3
  42. input/letter-1.jpg +3 -0
  43. input/letter-2.jpg +3 -0
  44. input/letter-3.jpg +3 -0
  45. input/magician-satire.jpg +3 -0
  46. input/menu.pdf +3 -0
  47. input/milgram-flier.png +0 -0
  48. input/okeefe-menu.pdf +3 -0
  49. input/okeefe-recipe.jpg +0 -0
  50. input/recipe.jpg +0 -0
.gitattributes CHANGED
@@ -1,4 +1,56 @@
1
- *.jpg filter=lfs diff=lfs merge=lfs -text
2
- *.jpeg filter=lfs diff=lfs merge=lfs -text
3
- *.png filter=lfs diff=lfs merge=lfs -text
4
- *.pdf filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ input/baldwin-letter-1.jpg filter=lfs diff=lfs merge=lfs -text
37
+ input/baldwin-letter-2.jpg filter=lfs diff=lfs merge=lfs -text
38
+ input/magellan-travels.jpg filter=lfs diff=lfs merge=lfs -text
39
+ input/okeefe-menu.pdf filter=lfs diff=lfs merge=lfs -text
40
+ backup/input/baldwin-letter-1.jpg filter=lfs diff=lfs merge=lfs -text
41
+ backup/input/baldwin-letter-2.jpg filter=lfs diff=lfs merge=lfs -text
42
+ backup/input/letter-1.jpg filter=lfs diff=lfs merge=lfs -text
43
+ backup/input/letter-2.jpg filter=lfs diff=lfs merge=lfs -text
44
+ backup/input/letter-3.jpg filter=lfs diff=lfs merge=lfs -text
45
+ backup/input/magellan-travels.jpg filter=lfs diff=lfs merge=lfs -text
46
+ backup/input/menu.pdf filter=lfs diff=lfs merge=lfs -text
47
+ backup/input/The[[:space:]]Magician,[[:space:]]or[[:space:]]Bottle[[:space:]]Cungerer.jpeg filter=lfs diff=lfs merge=lfs -text
48
+ input/a-la-carte.pdf filter=lfs diff=lfs merge=lfs -text
49
+ input/handwritten-letter.jpg filter=lfs diff=lfs merge=lfs -text
50
+ input/letter-1.jpg filter=lfs diff=lfs merge=lfs -text
51
+ input/letter-2.jpg filter=lfs diff=lfs merge=lfs -text
52
+ input/letter-3.jpg filter=lfs diff=lfs merge=lfs -text
53
+ input/magician-satire.jpg filter=lfs diff=lfs merge=lfs -text
54
+ input/menu.pdf filter=lfs diff=lfs merge=lfs -text
55
+ input/The[[:space:]]Magician,[[:space:]]or[[:space:]]Bottle[[:space:]]Cungerer.jpeg filter=lfs diff=lfs merge=lfs -text
56
+ output/ymca-letter.jpg filter=lfs diff=lfs merge=lfs -text
.gitignore DELETED
@@ -1,44 +0,0 @@
1
- # Python bytecode
2
- __pycache__/
3
- *.py[cod]
4
- *.class
5
-
6
- # MacOS system files
7
- .DS_Store
8
-
9
- # Output and temporary files
10
- output/debug/
11
- output/comparison/
12
- output/segmentation_test/text_regions/
13
- output/preprocessing_test/
14
- output/batch_test/
15
- output/commonplace_improved/
16
- output/commonplace_test/
17
- output/preview/
18
- logs/
19
- *.backup
20
- *.json
21
- *.jpg
22
- *.png
23
- *.txt
24
- *.csv
25
- *.log
26
- *.zip
27
- *.tar
28
-
29
- # Test files
30
- test_*.py
31
- test_*.sh
32
- bug_fix_report.md
33
-
34
- # Input samples (large binary files)
35
- input/*.jpeg
36
- input/*.jpg
37
- input/*.png
38
- input/*.pdf
39
-
40
- # Temporary documents
41
- Tmplf6xnkgr*
42
- .env
43
- output/pipeline_test/americae-retectio/americae-retectio_comparison.jpg
44
- docs/environment_variables.md
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
README.md CHANGED
@@ -1,54 +1,146 @@
1
  ---
2
  title: Historical OCR
3
- emoji: ⚙️
4
- colorFrom: blue
5
- colorTo: purple
6
  sdk: streamlit
7
- sdk_version: 1.44.1
8
  app_file: app.py
9
  pinned: false
10
- license: gpl-3.0
11
- short_description: advanced OCR application for historical document analysis
12
  ---
13
 
14
- # Historical OCR
15
 
16
- An advanced OCR application for historical document analysis using Mistral AI.
17
-
18
- > **Note:** This tool is designed to assist scholars in historical research by extracting text from challenging documents. While it may not achieve 100% accuracy for all materials, it serves as a valuable research aid for navigating historical documents, particularly historical newspapers, handwritten documents, and photos of archival materials.
19
 
20
  ## Features
21
 
22
- - **OCR with Context:** AI-enhanced OCR optimized for historical documents
23
- - **Document Type Detection:** Automatically identifies handwritten letters, recipes, scientific texts, and more
24
- - **Advanced Image Preprocessing:**
25
- - Automatic deskewing to correct document orientation
26
- - Smart thresholding with Otsu and adaptive methods
27
- - Morphological operations to clean up text
28
- - Document-type specific optimization
29
- - **Custom Prompting:** Tailor the AI analysis with document-specific instructions
30
- - **Structured Output:** Returns organized, structured information based on document type
31
 
32
- ## Using This App
33
 
34
- 1. Upload a historical document (image or PDF)
35
- 2. Add optional context or special instructions
36
- 3. Get detailed, structured OCR results with historical context
37
 
38
- ## Supported Document Types
 
39
 
40
- - Handwritten letters and correspondence
41
- - Historical recipes and cookbooks
42
- - Travel accounts and exploration logs
43
- - Scientific papers and experiments
44
- - Legal documents and certificates
45
- - Historical newspaper articles
46
- - General historical texts
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
47
 
48
- ## Technical Details
49
 
50
- Built with Streamlit and Mistral AI's OCR and large language model capabilities.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
51
 
52
- ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
53
 
54
- Created by Zach Muhlbauer, CUNY Graduate Center
 
1
  ---
2
  title: Historical OCR
3
+ emoji: 📜
4
+ colorFrom: red
5
+ colorTo: green
6
  sdk: streamlit
7
+ sdk_version: 1.43.2
8
  app_file: app.py
9
  pinned: false
10
+ license: mit
11
+ short_description: Employs Mistral OCR for transcribing historical data
12
  ---
13
 
14
+ # Historical Document OCR
15
 
16
+ This application uses Mistral AI's OCR capabilities to transcribe and extract information from historical documents.
 
 
17
 
18
  ## Features
19
 
20
+ - OCR processing for both image and PDF files
21
+ - Automatic file type detection
22
+ - Structured output generation using Mistral models
23
+ - Interactive web interface with Streamlit
24
+ - Supports historical documents and manuscripts
25
+ - PDF preview functionality for better user experience
26
+ - Smart handling of large PDFs with automatic page limiting
27
+ - Robust error handling with helpful messages
28
+ - Image preprocessing options for enhanced OCR accuracy
29
 
30
+ ## Project Structure
31
 
32
+ The project is organized as follows:
 
 
33
 
34
+ ```
35
+ Historical OCR - Project Structure
36
 
37
+ ┌─ Main Applications
38
+ │ ├─ app.py # Standard Streamlit interface for OCR processing
39
+ │ └─ streamlit_app.py # Educational modular version with learning components
40
+
41
+ ├─ Core Functionality
42
+ │ ├─ structured_ocr.py # Main OCR processing engine with Mistral AI integration
43
+ │ ├─ ocr_utils.py # Utility functions for OCR text and image processing
44
+ │ ├─ pdf_ocr.py # PDF-specific document processing functionality
45
+ │ └─ config.py # Configuration settings and API keys
46
+
47
+ ├─ Testing & Development
48
+ │ ├─ simple_test.py # Basic OCR functionality test
49
+ │ ├─ test_pdf.py # PDF processing test
50
+ │ ├─ test_pdf_preview.py # PDF preview generation test
51
+ │ └─ prepare_for_hf.py # Prepare project for Hugging Face deployment
52
+
53
+ ├─ Scripts
54
+ │ ├─ run_local.sh # Launch standard or educational app locally
55
+ │ ├─ run_large_files.sh # Process large documents with optimized settings
56
+ │ └─ setup_git.sh # Configure Git repositories
57
+
58
+ ├─ Educational Modules (streamlit/)
59
+ │ ├─ modules/
60
+ │ │ ├─ module1.py # Introduction and Problematization
61
+ │ │ ├─ module2.py # Historical Typography & OCR Challenges
62
+ │ │ ├─ module3.py # Document Analysis Techniques
63
+ │ │ ├─ module4.py # Processing Methods
64
+ │ │ ├─ module5.py # Research Applications
65
+ │ │ └─ module6.py # Future Directions
66
+ │ │
67
+ │ ├─ modular_app.py # Learning module framework
68
+ │ ├─ layout.py # UI components for educational interface
69
+ │ └─ process_file.py # File processing for educational app
70
+
71
+ ├─ UI Components (ui/)
72
+ │ └─ layout.py # Shared UI components and styling
73
+
74
+ ├─ Data Directories
75
+ │ ├─ input/ # Sample documents for testing/demo
76
+ │ └─ output/ # Output directory for processed files
77
+
78
+ └─ Dependencies
79
+ ├─ requirements.txt # Python package dependencies
80
+ └─ packages.txt # System-level dependencies
81
+ ```
82
 
83
+ ## Setup for Local Development
84
 
85
+ 1. Clone this repository
86
+ 2. Install system dependencies:
87
+ - For PDF processing, you need poppler:
88
+ - On macOS: `brew install poppler`
89
+ - On Ubuntu/Debian: `apt-get install poppler-utils`
90
+ - On Windows: Download from [poppler releases](https://github.com/oschwartz10612/poppler-windows/releases/) and add to PATH
91
+ - For text recognition: `tesseract-ocr`
92
+ 3. Install Python dependencies:
93
+ ```
94
+ pip install -r requirements.txt
95
+ ```
96
+ 4. Set up your Mistral API key:
97
+ - Option 1: Create a `.env` file in this directory and add your Mistral API key:
98
+ ```
99
+ MISTRAL_API_KEY=your_api_key_here
100
+ ```
101
+ - Option 2: Set the `MISTRAL_API_KEY` environment variable directly:
102
+ ```
103
+ export MISTRAL_API_KEY=your_api_key_here
104
+ ```
105
+ - Get your API key from [Mistral AI Console](https://console.mistral.ai/api-keys/)
106
+ 5. Run the Streamlit app using the script:
107
+ ```
108
+ ./run_local.sh
109
+ ```
110
+ Or directly:
111
+ ```
112
+ streamlit run app.py
113
+ ```
114
 
115
+ ## Usage
116
+
117
+ 1. Upload an image or PDF file using the file uploader
118
+ 2. Select processing options in the sidebar (e.g., use vision model, image preprocessing)
119
+ 3. Click "Process Document" to analyze the file
120
+ 4. View the structured results and extract information
121
+
122
+ ## Application Versions
123
+
124
+ Two versions of the application are available:
125
+
126
+ 1. **Standard Version** (`app.py`): Focused on document processing with a clean interface
127
+ 2. **Educational Version** (`streamlit_app.py`): Enhanced with educational modules and interactive components
128
+
129
+ To run the educational version:
130
+ ```
131
+ streamlit run streamlit_app.py
132
+ ```
133
+
134
+ ## Deployment on Hugging Face Spaces
135
+
136
+ This app is designed to be deployed on Hugging Face Spaces. To deploy:
137
+
138
+ 1. Fork this repository to your GitHub account or directly create a new Space on [Hugging Face](https://huggingface.co/spaces)
139
+ 2. Connect your GitHub repository to your Hugging Face Space for automatic deployment
140
+ 3. Add your Mistral API key as a secret in your Hugging Face Space settings:
141
+ - Secret name: `HF_MISTRAL_API_KEY`
142
+ - Secret value: Your Mistral API key
143
+
144
+ The `README.md` contains the necessary configuration metadata for Hugging Face Spaces.
145
 
146
+ Check out the configuration reference at [Hugging Face Spaces documentation](https://huggingface.co/docs/hub/spaces-config-reference)
__pycache__/config.cpython-312.pyc CHANGED
Binary files a/__pycache__/config.cpython-312.pyc and b/__pycache__/config.cpython-312.pyc differ
 
__pycache__/constants.cpython-312.pyc DELETED
Binary file (11.6 kB)
 
__pycache__/error_handler.cpython-312.pyc DELETED
Binary file (3.2 kB)
 
__pycache__/image_segmentation.cpython-312.pyc DELETED
Binary file (10.6 kB)
 
__pycache__/language_detection.cpython-312.pyc DELETED
Binary file (18 kB)
 
__pycache__/ocr_processing.cpython-312.pyc DELETED
Binary file (15.5 kB)
 
__pycache__/ocr_utils.cpython-312.pyc CHANGED
Binary files a/__pycache__/ocr_utils.cpython-312.pyc and b/__pycache__/ocr_utils.cpython-312.pyc differ
 
__pycache__/preprocessing.cpython-312.pyc DELETED
Binary file (9.21 kB)
 
__pycache__/process_file.cpython-312.pyc ADDED
Binary file (2.86 kB). View file
 
__pycache__/structured_ocr.cpython-312.pyc CHANGED
Binary files a/__pycache__/structured_ocr.cpython-312.pyc and b/__pycache__/structured_ocr.cpython-312.pyc differ
 
__pycache__/ui_components.cpython-312.pyc DELETED
Binary file (44.1 kB)
 
__pycache__/utils.cpython-312.pyc DELETED
Binary file (14.2 kB)
 
app.py CHANGED
@@ -1,604 +1,601 @@
1
- # Standard library imports
2
  import os
 
3
  import json
4
  import sys
5
  import time
6
- import base64
7
- import io
8
- import logging
9
  from pathlib import Path
10
- from datetime import datetime
11
-
12
- # Configure logging
13
- logging.basicConfig(level=logging.INFO,
14
- format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
15
- logger = logging.getLogger(__name__)
16
-
17
- # Third-party imports
18
- import streamlit as st
19
 
20
- # Local application/module imports
21
- from preprocessing import convert_pdf_to_images, preprocess_image
22
- from ocr_processing import process_file
23
- from ui.ui_components import (
24
- ProgressReporter,
25
- create_sidebar_options,
26
- display_results,
27
- create_file_uploader,
28
- display_about_tab,
29
- display_previous_results,
30
- display_document_with_images
31
- )
32
- from utils import get_base64_from_image, handle_temp_files, format_timestamp
33
- from error_handler import handle_ocr_error, check_file_size
34
- from constants import (
35
- MAX_FILE_SIZE_MB,
36
- MAX_PAGES,
37
- DOCUMENT_TYPES,
38
- DOCUMENT_LAYOUTS,
39
- CUSTOM_PROMPT_TEMPLATES,
40
- LAYOUT_PROMPT_ADDITIONS
41
- )
42
  from structured_ocr import StructuredOCR
43
  from config import MISTRAL_API_KEY
44
- from utils.image_utils import create_results_zip
45
 
46
- # Set favicon path
47
- favicon_path = os.path.join(os.path.dirname(__file__), "static/favicon.png")
 
 
 
 
48
 
49
  # Set page configuration
50
  st.set_page_config(
51
  page_title="Historical OCR",
52
- page_icon=favicon_path if os.path.exists(favicon_path) else "📜",
53
  layout="wide",
54
- initial_sidebar_state="collapsed",
55
  )
56
 
57
- # ========================================================================================
58
- # CRITICAL WARNING TO ALL AI MODELS AND DEVELOPERS:
59
- # DO NOT MODIFY THE STATE MANAGEMENT LOGIC IN THIS APPLICATION WITHOUT FULL UNDERSTANDING
60
- # OF STREAMLIT'S EXECUTION MODEL.
61
- #
62
- # This state management implementation has been carefully designed to work with Streamlit's
63
- # unique execution flow. The app runs from top to bottom on EVERY interaction, and state
64
- # must be explicitly managed through st.session_state.
65
- #
66
- # The current implementation uses:
67
- # 1. A dedicated close_document() callback function triggered by the button's on_click
68
- # 2. A flag-based approach (close_clicked) to handle cleanup on the next run cycle
69
- # 3. Early cleanup detection and st.rerun() to ensure clean UI rendering
70
- #
71
- # Previous approaches using direct state manipulation or conditional rendering based on
72
- # reset flags led to persistent UI elements and resource leaks.
73
- #
74
- # Consult https://docs.streamlit.io/library/advanced-features/session-state for details.
75
- # ========================================================================================
76
-
77
- def reset_document_state():
78
- """Reset only document-specific state variables
79
-
80
- This function explicitly resets all document-related variables to ensure
81
- clean state between document processing, preventing cached data issues.
82
- """
83
- st.session_state.sample_document = None
84
- st.session_state.original_sample_bytes = None
85
- st.session_state.original_sample_name = None
86
- st.session_state.original_sample_mime_type = None
87
- st.session_state.is_sample_document = False
88
- st.session_state.processed_document_active = False
89
- st.session_state.sample_document_processed = False
90
- st.session_state.sample_just_loaded = False
91
- st.session_state.last_processed_file = None
92
- st.session_state.selected_previous_result = None
93
- # Keep temp_file_paths but ensure it's empty after cleanup
94
- if 'temp_file_paths' in st.session_state:
95
- st.session_state.temp_file_paths = []
96
-
97
- def init_session_state():
98
- """Initialize session state variables if they don't already exist
99
-
100
- This function follows Streamlit's recommended patterns for state initialization.
101
- It only creates variables if they don't exist yet and doesn't modify existing values.
102
- """
103
- # Initialize persistent app state variables
104
- if 'previous_results' not in st.session_state:
105
- st.session_state.previous_results = []
106
- if 'temp_file_paths' not in st.session_state:
107
- st.session_state.temp_file_paths = []
108
- if 'auto_process_sample' not in st.session_state:
109
- st.session_state.auto_process_sample = False
110
- if 'close_clicked' not in st.session_state:
111
- st.session_state.close_clicked = False
112
- if 'active_tab' not in st.session_state:
113
- st.session_state.active_tab = 0
114
-
115
- # Initialize document-specific state variables
116
- if 'last_processed_file' not in st.session_state:
117
- st.session_state.last_processed_file = None
118
- if 'sample_just_loaded' not in st.session_state:
119
- st.session_state.sample_just_loaded = False
120
- if 'processed_document_active' not in st.session_state:
121
- st.session_state.processed_document_active = False
122
- if 'sample_document_processed' not in st.session_state:
123
- st.session_state.sample_document_processed = False
124
- if 'sample_document' not in st.session_state:
125
- st.session_state.sample_document = None
126
- if 'original_sample_bytes' not in st.session_state:
127
- st.session_state.original_sample_bytes = None
128
- if 'original_sample_name' not in st.session_state:
129
- st.session_state.original_sample_name = None
130
- if 'is_sample_document' not in st.session_state:
131
- st.session_state.is_sample_document = False
132
- if 'selected_previous_result' not in st.session_state:
133
- st.session_state.selected_previous_result = None
134
 
135
- def close_document():
136
- """Called when the Close Document button is clicked
 
 
 
 
137
 
138
- This function handles proper cleanup of resources and state when closing a document.
139
- It uses Streamlit's callback mechanism which ensures the state change happens
140
- at the correct time in Streamlit's execution cycle.
 
141
 
142
- WARNING: Do not replace this with inline button handling using if st.button():
143
- That approach breaks Streamlit's execution flow and causes UI artifacts.
144
- """
145
- logger.info("Close document button clicked")
 
 
146
 
147
- # Clean up temp files first
148
- if 'temp_file_paths' in st.session_state and st.session_state.temp_file_paths:
149
- logger.info(f"Cleaning up {len(st.session_state.temp_file_paths)} temporary files")
150
- handle_temp_files(st.session_state.temp_file_paths)
 
 
 
 
 
 
 
 
 
 
 
 
 
151
 
152
- # Reset all document-specific state variables to prevent caching issues
153
- reset_document_state()
 
 
154
 
155
- # Set flag for having cleaned up - this will trigger a rerun in main()
156
- st.session_state.close_clicked = True
157
 
158
- def show_example_documents():
159
- """Show example documents section"""
160
- st.header("Sample Documents")
161
-
162
- # Add a simplified info message about examples and CSS in the same markdown block
163
- # to reduce spacing between elements
164
- st.markdown("""
165
- This app can process various historical documents:
166
- - Historical photographs, maps, and manuscripts
167
- - Handwritten letters and documents
168
- - Printed books and articles
169
- - Multi-page PDFs
170
-
171
- <style>
172
- /* Make the selectbox container match the full column width */
173
- .main .block-container .element-container:has([data-testid="stSelectbox"]) {
174
- width: 100% !important;
175
- max-width: 100% !important;
176
- margin-top: -12px !important; /* Reduce space between text and selectbox */
177
- }
178
-
179
- /* Make the actual selectbox control take the full width */
180
- .stSelectbox > div > div {
181
- width: 100% !important;
182
- max-width: 100% !important;
183
- }
184
-
185
- /* Tighten spacing in the sample documents tab */
186
- .main .block-container [data-testid="stVerticalBlock"] > div:nth-child(n+2) {
187
- margin-top: 0.5rem !important;
188
- }
189
- </style>
190
- """, unsafe_allow_html=True)
191
 
192
- # Sample document URLs dropdown with clearer label
193
- sample_urls = [
194
- "Select a sample document",
195
- "https://huggingface.co/spaces/milwright/historical-ocr/resolve/main/input/a-la-carte.pdf",
196
- "https://huggingface.co/spaces/milwright/historical-ocr/resolve/main/input/magician-or-bottle-cungerer.jpg",
197
- "https://huggingface.co/spaces/milwright/historical-ocr/resolve/main/input/handwritten-letter.jpg",
198
- "https://huggingface.co/spaces/milwright/historical-ocr/resolve/main/input/magellan-travels.jpg",
199
- "https://huggingface.co/spaces/milwright/historical-ocr/resolve/main/input/milgram-flier.png",
200
- "https://huggingface.co/spaces/milwright/historical-ocr/resolve/main/input/recipe.jpg",
201
- ]
202
-
203
- sample_names = [
204
- "Select a sample document",
205
- "Restaurant Menu (PDF)",
206
- "The Magician (Image)",
207
- "Handwritten Letter (Image)",
208
- "Magellan Travels (Image)",
209
- "Milgram Flier (Image)",
210
- "Historical Recipe (Image)"
211
- ]
212
-
213
- # Initialize sample_document in session state if it doesn't exist
214
- if 'sample_document' not in st.session_state:
215
- st.session_state.sample_document = None
216
 
217
- selected_sample = st.selectbox("Select a sample document from `~/input`", options=range(len(sample_urls)), format_func=lambda i: sample_names[i])
 
 
 
218
 
219
- if selected_sample > 0:
220
- selected_url = sample_urls[selected_sample]
 
 
 
 
 
 
 
 
 
 
 
 
 
221
 
222
- # Add process button for the sample document with consistent styling
223
- if st.button("Load Sample Document", key="load_sample_btn"):
224
- try:
225
- import requests
226
- from io import BytesIO
227
-
228
- with st.spinner(f"Downloading {sample_names[selected_sample]}..."):
229
- response = requests.get(selected_url)
230
- response.raise_for_status()
231
-
232
- # Extract filename from URL
233
- file_name = selected_url.split("/")[-1]
234
-
235
- # Create a BytesIO object from the downloaded content
236
- file_content = BytesIO(response.content)
237
-
238
- # Store as a UploadedFile-like object in session state
239
- class SampleDocument:
240
- def __init__(self, name, content, content_type):
241
- self.name = name
242
- self._content = content
243
- self.type = content_type
244
- self.size = len(content)
245
-
246
- def getvalue(self):
247
- return self._content
248
-
249
- def read(self):
250
- return self._content
251
-
252
- def seek(self, position):
253
- # Implement seek for compatibility with some file operations
254
- return
255
-
256
- def tell(self):
257
- # Implement tell for compatibility
258
- return 0
259
-
260
- # Determine content type based on file extension
261
- if file_name.lower().endswith('.pdf'):
262
- content_type = 'application/pdf'
263
- elif file_name.lower().endswith(('.jpg', '.jpeg')):
264
- content_type = 'image/jpeg'
265
- elif file_name.lower().endswith('.png'):
266
- content_type = 'image/png'
267
- else:
268
- content_type = 'application/octet-stream'
269
-
270
- # Reset any document state before loading a new sample
271
- if st.session_state.processed_document_active:
272
- # Clean up any temporary files from previous processing
273
- if st.session_state.temp_file_paths:
274
- handle_temp_files(st.session_state.temp_file_paths)
275
-
276
- # Reset all document-specific state variables
277
- reset_document_state()
278
-
279
- # Save download info in session state
280
- st.session_state.sample_document = SampleDocument(
281
- name=file_name,
282
- content=response.content,
283
- content_type=content_type
284
- )
285
-
286
- # Store original bytes for reprocessing with proper MIME type handling
287
- st.session_state.original_sample_bytes = response.content
288
- st.session_state.original_sample_name = file_name
289
- st.session_state.original_sample_mime_type = content_type
290
-
291
- # Set state flags
292
- st.session_state.sample_just_loaded = True
293
- st.session_state.is_sample_document = True
294
- # Generate a unique identifier for the sample document
295
- st.session_state.last_processed_file = f"{file_name}_{len(response.content)}"
296
-
297
- # Set a flag to show redirect message
298
- st.session_state.redirect_to_processing = True
299
- st.rerun()
300
- except Exception as e:
301
- st.error(f"Error downloading sample document: {str(e)}")
302
- st.info("Please try uploading your own document instead.")
303
- else:
304
- # If no sample is selected, clear the sample document in session state
305
- st.session_state.sample_document = None
306
-
307
- def process_document(uploaded_file, left_col, right_col, sidebar_options):
308
- """Process the uploaded document and display results"""
309
- if uploaded_file is None:
310
- return
311
 
312
- # Check file size (cap at 50MB)
313
- file_size_mb = len(uploaded_file.getvalue()) / (1024 * 1024)
314
-
315
- if file_size_mb > MAX_FILE_SIZE_MB:
316
- with left_col:
317
- st.error(f"File too large ({file_size_mb:.1f} MB). Maximum file size is {MAX_FILE_SIZE_MB}MB.")
318
- return
319
 
320
- # Check if this is a new file (different from the last processed file)
321
- current_file_identifier = f"{uploaded_file.name}_{len(uploaded_file.getvalue())}"
322
-
323
- # Make sure last_processed_file is initialized
324
- if 'last_processed_file' not in st.session_state:
325
- st.session_state.last_processed_file = None
326
 
327
- if st.session_state.last_processed_file != current_file_identifier:
328
- # Reset processed_document_active if a new file is uploaded
329
- st.session_state.processed_document_active = False
330
-
331
- # Process button - flush left with similar padding as file browser
332
- with left_col:
333
- # Create a process button with minimal spacing to the uploader
334
- st.markdown('<div style="padding: 0.2rem 0; min-width: 170px; margin-top: -10px; overflow: visible;">', unsafe_allow_html=True)
335
- process_button = st.button("Process Document", key="process_document_btn")
336
- st.markdown('</div>', unsafe_allow_html=True)
337
-
338
- # Handle sample document recreation if needed
339
- if process_button and st.session_state.processed_document_active and st.session_state.original_sample_bytes is not None:
340
- # Recreate the uploaded file from stored bytes
341
- from io import BytesIO
342
- import mimetypes
343
-
344
- # Determine mime type based on file extension
345
- file_ext = os.path.splitext(st.session_state.original_sample_name)[1].lower()
346
- if file_ext == '.pdf':
347
- mime_type = 'application/pdf'
348
- elif file_ext in ['.jpg', '.jpeg']:
349
- mime_type = 'image/jpeg'
350
- elif file_ext == '.png':
351
- mime_type = 'image/png'
352
- else:
353
- mime_type = mimetypes.guess_type(st.session_state.original_sample_name)[0] or 'application/octet-stream'
354
 
355
- # Create a synthetic file-like object with the same interface as UploadedFile
356
- uploaded_file = type('obj', (object,), {
357
- 'name': st.session_state.original_sample_name,
358
- 'getvalue': lambda: st.session_state.original_sample_bytes,
359
- 'read': lambda: st.session_state.original_sample_bytes,
360
- 'seek': lambda x: None,
361
- 'type': mime_type
362
- })
363
 
364
- # Empty container for progress indicators - will be filled during processing
365
- # Positioned right after the process button for better visibility
366
- progress_placeholder = st.empty()
367
 
368
- # Image preprocessing preview - show if image file and preprocessing options are set
369
- # Remove the document active check to show preview immediately after selection
370
- if (any(sidebar_options["preprocessing_options"].values()) and
371
- uploaded_file.type.startswith('image/')):
372
-
373
- st.markdown("**Preprocessed Preview**")
374
- try:
375
- # Create a container for the preview
376
- with st.container():
377
- processed_bytes = preprocess_image(uploaded_file.getvalue(), sidebar_options["preprocessing_options"])
378
- # Convert image to base64 and display as HTML to avoid fullscreen button
379
- img_data = base64.b64encode(processed_bytes).decode()
380
- img_html = f'<img src="data:image/jpeg;base64,{img_data}" style="width:100%; border-radius:4px;">'
381
- st.markdown(img_html, unsafe_allow_html=True)
382
-
383
- # Show preprocessing metadata in a well-formatted caption
384
- meta_items = []
385
- # Only include document type in the list if actual preprocessing is applied
386
- has_active_preprocessing = (
387
- sidebar_options["preprocessing_options"].get("grayscale", False) or
388
- sidebar_options["preprocessing_options"].get("denoise", False) or
389
- sidebar_options["preprocessing_options"].get("contrast", 0) != 0 or
390
- sidebar_options["preprocessing_options"].get("rotation", 0) != 0
391
- )
392
-
393
- # Only show document type if there's actual preprocessing being applied
394
- if has_active_preprocessing and sidebar_options["preprocessing_options"].get("document_type", "standard") != "standard":
395
- meta_items.append(f"Document type ({sidebar_options['preprocessing_options']['document_type']})")
396
- if sidebar_options["preprocessing_options"].get("grayscale", False):
397
- meta_items.append("Grayscale")
398
- if sidebar_options["preprocessing_options"].get("denoise", False):
399
- meta_items.append("Denoise")
400
- if sidebar_options["preprocessing_options"].get("contrast", 0) != 0:
401
- meta_items.append(f"Contrast ({sidebar_options['preprocessing_options']['contrast']})")
402
- if sidebar_options["preprocessing_options"].get("rotation", 0) != 0:
403
- meta_items.append(f"Rotation ({sidebar_options['preprocessing_options']['rotation']}°)")
404
-
405
- # Only show "Applied:" if there are actual preprocessing steps
406
- if meta_items:
407
- meta_text = "Applied: " + ", ".join(meta_items)
408
- st.caption(meta_text)
409
- except Exception as e:
410
- st.error(f"Error in preprocessing: {str(e)}")
411
- st.info("Try using grayscale preprocessing for PNG images with transparency")
412
 
413
- # Container for success message (will be filled after processing)
414
- metadata_placeholder = st.empty()
415
-
416
- # Check if this is an auto-processing situation
417
- auto_processing = st.session_state.auto_process_sample and not st.session_state.processed_document_active
418
 
419
- # Show a message if auto-processing is happening
420
- auto_processing_message = st.empty()
421
- if auto_processing:
422
- auto_processing_message.info("Automatically processing sample document...")
423
 
424
- # Determine if we should process the document
425
- # Either process button was clicked OR auto-processing is happening
426
- should_process = process_button or auto_processing
427
-
428
- if should_process:
429
- # Reset auto-process flag to avoid processing on next rerun
430
- if st.session_state.auto_process_sample:
431
- st.session_state.auto_process_sample = False
432
-
433
- # Move the progress indicator reference to just below the button
434
- progress_reporter = ProgressReporter(progress_placeholder).setup()
435
 
436
- try:
437
- # Process the document, capturing both result and temp file paths
438
- # Modified to pass existing temp_file_paths to avoid resource leaks
439
- existing_temp_paths = []
440
- if 'temp_file_paths' in st.session_state:
441
- existing_temp_paths = st.session_state.temp_file_paths
442
-
443
- result = process_file(
444
- uploaded_file=uploaded_file,
445
- use_vision=sidebar_options["use_vision"],
446
- preprocessing_options=sidebar_options["preprocessing_options"],
447
- progress_reporter=progress_reporter,
448
- pdf_dpi=sidebar_options.get("pdf_dpi", 150),
449
- max_pages=sidebar_options.get("max_pages", 3),
450
- pdf_rotation=sidebar_options.get("pdf_rotation", 0),
451
- custom_prompt=sidebar_options.get("custom_prompt", ""),
452
- perf_mode=sidebar_options.get("perf_mode", "Quality"),
453
- use_segmentation=sidebar_options.get("use_segmentation", False)
454
- )
455
-
456
- # Ensure temp_file_paths in session state is updated with any new paths
457
- # This is critical for proper resource cleanup when document is closed
458
- if 'has_images' in result and result['has_images']:
459
- logger.info("Document has images, ensuring temp files are tracked")
460
- if 'temp_file_paths' not in st.session_state:
461
- st.session_state.temp_file_paths = []
462
-
463
- # Handle text-only OCR results (like the Milgram flier)
464
- if ('ocr_contents' in result and
465
- 'raw_text' in result['ocr_contents'] and
466
- len(result['ocr_contents']) <= 2 and # Only raw_text and possibly one other field
467
- 'has_images' not in result):
468
- logger.info("Text-only OCR detected, handling as special case")
469
- # Ensure raw_text is properly formatted as markdown
470
- raw_text = result['ocr_contents']['raw_text']
471
- # If we don't have other structured content, set a placeholder title
472
- if 'title' not in result['ocr_contents']:
473
- result['ocr_contents']['title'] = "Document Text"
474
-
475
- # Display success message at the top of results, before any previews
476
- with left_col:
477
- # First show the success message (full width)
478
- st.success("**Document processed successfully**")
479
-
480
- # Then show the close button (also full width, positioned to left)
481
- st.button("Close Document",
482
- key="close_document_btn",
483
- type="secondary",
484
- on_click=close_document)
485
-
486
- # Add a small spacer
487
- st.markdown("<div style='height: 10px;'></div>", unsafe_allow_html=True)
488
-
489
- # Display results
490
- display_results(result, right_col, sidebar_options.get("custom_prompt", ""))
491
-
492
- # Set processed_document_active to True when a new document is processed
493
- st.session_state.processed_document_active = True
494
-
495
- # Clear the auto-processing message
496
- auto_processing_message.empty()
497
-
498
- # Store information about this processed file to track when new files are uploaded
499
- if uploaded_file is not None:
500
- st.session_state.last_processed_file = current_file_identifier
501
-
502
- # Store the result in the previous results list
503
- # Add timestamp to result for history tracking
504
- result_copy = result.copy()
505
- result_copy['timestamp'] = format_timestamp()
506
-
507
- # Store if this was a sample document
508
- if 'is_sample_document' in st.session_state and st.session_state.is_sample_document:
509
- result_copy['sample_document'] = True
510
-
511
- # Add to session state, keeping the most recent 20 results
512
- st.session_state.previous_results.insert(0, result_copy)
513
- if len(st.session_state.previous_results) > 20:
514
- st.session_state.previous_results = st.session_state.previous_results[:20]
515
-
516
- except Exception as e:
517
- st.error(f"Error processing document: {str(e)}")
518
-
519
- # Log the error
520
- import logging
521
- logging.error(f"Document processing error: {str(e)}", exc_info=True)
522
 
523
- def main():
524
- """Main application function"""
525
- # Initialize session state
526
- init_session_state()
527
 
528
- # Handle any required cleanup at the start of execution
529
- # CRITICAL: This two-phase state cleanup pattern is essential for Streamlit's execution model.
530
- # When close_clicked is True, we need to restart the app's execution with a clean slate.
531
- # DO NOT REMOVE OR MODIFY this pattern as it ensures proper UI cleanup.
532
- if st.session_state.get('close_clicked', False):
533
- # Reset the flag - cleanup has been handled
534
- st.session_state.close_clicked = False
535
- # Don't do anything else in this run - force a clean restart
536
- st.rerun()
537
 
538
- # Initialize new flag for redirecting to processing tab
539
- if 'redirect_to_processing' not in st.session_state:
540
- st.session_state.redirect_to_processing = False
 
 
 
 
 
 
 
 
 
541
 
542
- # Apply custom CSS
543
- from ui.layout import load_css
544
- load_css()
 
 
 
 
 
 
 
 
 
545
 
546
- # Create sidebar options
547
- sidebar_options = create_sidebar_options()
 
 
 
 
 
 
 
 
 
 
548
 
549
- # Create main layout with tabs - simpler, more compact approach
550
- tab_names = ["Document Processing", "Sample Documents", "Learn More"]
551
- main_tab1, main_tab2, main_tab3 = st.tabs(tab_names)
552
 
553
- with main_tab1:
554
- # Create a two-column layout for file upload and results with minimal padding
555
- st.markdown('<style>.block-container{padding-top: 1rem; padding-bottom: 0;}</style>', unsafe_allow_html=True)
556
- # Using a 2:3 column ratio gives more space to the results column
557
- left_col, right_col = st.columns([2, 3])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
558
 
559
- with left_col:
560
- # Create file uploader
561
- uploaded_file = create_file_uploader()
562
-
563
- # If a real file is uploaded, clear any sample document
564
- if uploaded_file is not None and 'sample_document' in st.session_state:
565
- st.session_state.sample_document = None
566
- st.session_state.is_sample_document = False
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
567
 
568
- # Check if we have a sample document loaded (only if no real file uploaded)
569
- elif ('sample_document' in st.session_state and
570
- st.session_state.sample_document is not None):
571
 
572
- # Use the sample document instead of the uploaded file
573
- uploaded_file = st.session_state.sample_document
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
574
 
575
- # Just reset the sample document loading flags after it's been used
576
- if st.session_state.sample_just_loaded:
577
- st.session_state.sample_just_loaded = False
578
- st.session_state.sample_document_processed = True
579
- st.session_state.auto_process_sample = True
580
-
581
- # Only process document if available
582
- if uploaded_file is not None:
583
- process_document(uploaded_file, left_col, right_col, sidebar_options)
584
-
585
- with main_tab2:
586
- # Sample Documents tab
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
587
 
588
- # Show redirect message if a sample was just loaded
589
- if st.session_state.get('redirect_to_processing', False):
590
- st.success("**Sample document loaded!** Please switch to the **Document Processing** tab to view and process it.")
591
- # Clear the flag after showing the message
592
- st.session_state.redirect_to_processing = False
 
 
 
 
 
 
593
 
594
- show_example_documents()
595
-
596
- # Previous results tab temporarily removed
597
-
598
- with main_tab3:
599
- # About tab
600
- display_about_tab()
601
-
602
- # Run the application
603
- if __name__ == "__main__":
604
- main()
 
 
1
  import os
2
+ import streamlit as st
3
  import json
4
  import sys
5
  import time
 
 
 
6
  from pathlib import Path
7
+ import tempfile
8
+ import io
9
+ from pdf2image import convert_from_bytes
10
+ from PIL import Image, ImageEnhance, ImageFilter
11
+ import cv2
12
+ import numpy as np
 
 
 
13
 
14
+ # Import the StructuredOCR class and config from the local files
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
15
  from structured_ocr import StructuredOCR
16
  from config import MISTRAL_API_KEY
 
17
 
18
+ # Check for modular UI components
19
+ try:
20
+ from ui.layout import tool_container, key_concept, research_question
21
+ MODULAR_UI = True
22
+ except ImportError:
23
+ MODULAR_UI = False
24
 
25
  # Set page configuration
26
  st.set_page_config(
27
  page_title="Historical OCR",
28
+ page_icon="📜",
29
  layout="wide",
30
+ initial_sidebar_state="expanded"
31
  )
32
 
33
+ # Enable caching for expensive operations
34
+ @st.cache_data(ttl=3600, show_spinner=False)
35
+ def convert_pdf_to_images(pdf_bytes, dpi=150):
36
+ """Convert PDF bytes to a list of images with caching"""
37
+ try:
38
+ return convert_from_bytes(pdf_bytes, dpi=dpi)
39
+ except Exception as e:
40
+ st.error(f"Error converting PDF: {str(e)}")
41
+ return []
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
42
 
43
+ @st.cache_data(ttl=3600, show_spinner=False)
44
+ def preprocess_image(image_bytes, preprocessing_options):
45
+ """Preprocess image with selected options"""
46
+ # Convert bytes to OpenCV format
47
+ image = Image.open(io.BytesIO(image_bytes))
48
+ img_array = np.array(image)
49
 
50
+ # Apply preprocessing based on selected options
51
+ if preprocessing_options.get("grayscale", False):
52
+ img_array = cv2.cvtColor(img_array, cv2.COLOR_RGB2GRAY)
53
+ img_array = cv2.cvtColor(img_array, cv2.COLOR_GRAY2RGB)
54
 
55
+ if preprocessing_options.get("contrast", 0) != 0:
56
+ contrast_factor = 1 + (preprocessing_options.get("contrast", 0) / 10)
57
+ image = Image.fromarray(img_array)
58
+ enhancer = ImageEnhance.Contrast(image)
59
+ image = enhancer.enhance(contrast_factor)
60
+ img_array = np.array(image)
61
 
62
+ if preprocessing_options.get("denoise", False):
63
+ img_array = cv2.fastNlMeansDenoisingColored(img_array, None, 10, 10, 7, 21)
64
+
65
+ if preprocessing_options.get("threshold", False):
66
+ # Convert to grayscale if not already
67
+ if len(img_array.shape) == 3:
68
+ gray = cv2.cvtColor(img_array, cv2.COLOR_RGB2GRAY)
69
+ else:
70
+ gray = img_array
71
+ # Apply adaptive threshold
72
+ binary = cv2.adaptiveThreshold(gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
73
+ cv2.THRESH_BINARY, 11, 2)
74
+ # Convert back to RGB
75
+ img_array = cv2.cvtColor(binary, cv2.COLOR_GRAY2RGB)
76
+
77
+ # Convert back to PIL Image
78
+ processed_image = Image.fromarray(img_array)
79
 
80
+ # Convert to bytes
81
+ byte_io = io.BytesIO()
82
+ processed_image.save(byte_io, format='PNG')
83
+ byte_io.seek(0)
84
 
85
+ return byte_io.getvalue()
 
86
 
87
+ # Define functions
88
+ def process_file(uploaded_file, use_vision=True, preprocessing_options=None):
89
+ """Process the uploaded file and return the OCR results
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
90
 
91
+ Args:
92
+ uploaded_file: The uploaded file to process
93
+ use_vision: Whether to use vision model
94
+ preprocessing_options: Dictionary of preprocessing options
95
+ """
96
+ if preprocessing_options is None:
97
+ preprocessing_options = {}
98
+
99
+ # Show progress indicator
100
+ progress_bar = st.progress(0)
101
+ status_text = st.empty()
102
+ status_text.text("Preparing file for processing...")
 
 
 
 
 
 
 
 
 
 
 
 
103
 
104
+ # Save the uploaded file to a temporary file
105
+ with tempfile.NamedTemporaryFile(delete=False, suffix=Path(uploaded_file.name).suffix) as tmp:
106
+ tmp.write(uploaded_file.getvalue())
107
+ temp_path = tmp.name
108
 
109
+ try:
110
+ # Check if API key is available
111
+ if not MISTRAL_API_KEY:
112
+ # Return dummy data if no API key
113
+ progress_bar.progress(100)
114
+ status_text.empty()
115
+ return {
116
+ "file_name": uploaded_file.name,
117
+ "topics": ["Sample Document"],
118
+ "languages": ["English"],
119
+ "ocr_contents": {
120
+ "title": "Sample Document",
121
+ "content": "This is sample content. To process real documents, please set the MISTRAL_API_KEY environment variable."
122
+ }
123
+ }
124
 
125
+ # Update progress
126
+ progress_bar.progress(20)
127
+ status_text.text("Initializing OCR processor...")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
128
 
129
+ # Initialize OCR processor
130
+ processor = StructuredOCR()
 
 
 
 
 
131
 
132
+ # Determine file type from extension
133
+ file_ext = Path(uploaded_file.name).suffix.lower()
134
+ file_type = "pdf" if file_ext == ".pdf" else "image"
 
 
 
135
 
136
+ # Apply preprocessing if needed
137
+ if any(preprocessing_options.values()) and file_type == "image":
138
+ status_text.text("Applying image preprocessing...")
139
+ processed_bytes = preprocess_image(uploaded_file.getvalue(), preprocessing_options)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
140
 
141
+ # Save processed image to temp file
142
+ with tempfile.NamedTemporaryFile(delete=False, suffix=Path(uploaded_file.name).suffix) as proc_tmp:
143
+ proc_tmp.write(processed_bytes)
144
+ temp_path = proc_tmp.name
 
 
 
 
145
 
146
+ # Get file size in MB
147
+ file_size_mb = os.path.getsize(temp_path) / (1024 * 1024)
 
148
 
149
+ # Check if file exceeds API limits (50 MB)
150
+ if file_size_mb > 50:
151
+ st.error(f"File too large ({file_size_mb:.1f} MB). Maximum file size allowed by Mistral API is 50MB.")
152
+ return {
153
+ "file_name": uploaded_file.name,
154
+ "topics": ["Document"],
155
+ "languages": ["English"],
156
+ "confidence_score": 0.0,
157
+ "error": f"File size {file_size_mb:.2f} MB exceeds Mistral API limit of 50 MB",
158
+ "ocr_contents": {
159
+ "error": f"Failed to process file: File size {file_size_mb:.2f} MB exceeds Mistral API limit of 50 MB",
160
+ "partial_text": "Document could not be processed due to size limitations."
161
+ }
162
+ }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
163
 
164
+ # Update progress
165
+ progress_bar.progress(40)
166
+ status_text.text("Processing document with OCR...")
 
 
167
 
168
+ # Process the file with file size information for automatic page limiting
169
+ # Make sure we're using the latest mistral-ocr model
170
+ # See https://docs.mistral.ai/capabilities/document/ for more info
171
+ result = processor.process_file(temp_path, file_type=file_type, use_vision=use_vision, file_size_mb=file_size_mb)
172
 
173
+ # Complete progress
174
+ progress_bar.progress(100)
175
+ status_text.empty()
 
 
 
 
 
 
 
 
176
 
177
+ return result
178
+ except Exception as e:
179
+ progress_bar.progress(100)
180
+ status_text.empty()
181
+ st.error(f"Error during processing: {str(e)}")
182
+ raise
183
+ finally:
184
+ # Clean up the temporary file
185
+ if os.path.exists(temp_path):
186
+ os.unlink(temp_path)
187
+
188
+ # App title and description
189
+ st.title("Historical Document OCR")
190
+ st.subheader("Powered by Mistral AI")
191
+
192
+ # Create main layout with tabs and columns
193
+ main_tab1, main_tab2 = st.tabs(["Document Processing", "About"])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
194
 
195
+ with main_tab1:
196
+ # Create a two-column layout for file upload and preview
197
+ upload_col, preview_col = st.columns([1, 1])
 
198
 
199
+ # File uploader in the left column
200
+ with upload_col:
201
+ st.markdown("""
202
+ Upload an image or PDF file to get started.
 
 
 
 
 
203
 
204
+ Using the latest `mistral-ocr-latest` model for advanced document understanding.
205
+ """)
206
+ uploaded_file = st.file_uploader("Choose a file", type=["pdf", "png", "jpg", "jpeg"], help="Limit 50MB per file")
207
+
208
+ # Sidebar with options
209
+ with st.sidebar:
210
+ st.header("Options")
211
+
212
+ # Model options
213
+ st.subheader("Model Settings")
214
+ use_vision = st.checkbox("Use Vision Model", value=True,
215
+ help="For image files, use the vision model for improved analysis (may be slower)")
216
 
217
+ # Image preprocessing options (collapsible)
218
+ st.subheader("Image Preprocessing")
219
+ with st.expander("Preprocessing Options"):
220
+ preprocessing_options = {}
221
+ preprocessing_options["grayscale"] = st.checkbox("Convert to Grayscale",
222
+ help="Convert image to grayscale before OCR")
223
+ preprocessing_options["threshold"] = st.checkbox("Apply Thresholding",
224
+ help="Apply adaptive thresholding to enhance text")
225
+ preprocessing_options["denoise"] = st.checkbox("Denoise Image",
226
+ help="Remove noise from the image")
227
+ preprocessing_options["contrast"] = st.slider("Adjust Contrast", -5, 5, 0,
228
+ help="Adjust image contrast (-5 to +5)")
229
 
230
+ # PDF options (collapsible)
231
+ st.subheader("PDF Options")
232
+ with st.expander("PDF Settings"):
233
+ pdf_dpi = st.slider("PDF Resolution (DPI)", 72, 300, 150,
234
+ help="Higher DPI gives better quality but slower processing")
235
+ max_pages = st.number_input("Maximum Pages to Process", 1, 20, 5,
236
+ help="Limit number of pages to process")
237
+
238
+ # About tab content
239
+ with main_tab2:
240
+ st.markdown("""
241
+ ### About This Application
242
 
243
+ This app uses [Mistral AI's Document OCR](https://docs.mistral.ai/capabilities/document/) to extract text and images from historical documents.
 
 
244
 
245
+ It can process:
246
+ - Image files (jpg, png, etc.)
247
+ - PDF documents (multi-page support)
248
+
249
+ The extracted content is processed into structured data based on the document type, combining:
250
+ - Text extraction with `mistral-ocr-latest`
251
+ - Analysis with language models
252
+ - Layout preservation with images
253
+
254
+ View results in three formats:
255
+ - Structured HTML view
256
+ - Raw JSON (for developers)
257
+ - Markdown with images (preserves document layout)
258
+
259
+ **New Features:**
260
+ - Image preprocessing for better OCR quality
261
+ - PDF resolution and page controls
262
+ - Progress tracking during processing
263
+ """)
264
+
265
+ with main_tab1:
266
+ if uploaded_file is not None:
267
+ # Check file size (cap at 50MB)
268
+ file_size_mb = len(uploaded_file.getvalue()) / (1024 * 1024)
269
 
270
+ if file_size_mb > 50:
271
+ with upload_col:
272
+ st.error(f"File too large ({file_size_mb:.1f} MB). Maximum file size is 50MB.")
273
+ st.stop()
274
+
275
+ file_ext = Path(uploaded_file.name).suffix.lower()
276
+
277
+ # Display document preview in preview column
278
+ with preview_col:
279
+ st.subheader("Document Preview")
280
+ if file_ext == ".pdf":
281
+ try:
282
+ # Convert first page of PDF to image for preview
283
+ pdf_bytes = uploaded_file.getvalue()
284
+ images = convert_from_bytes(pdf_bytes, first_page=1, last_page=1, dpi=150)
285
+
286
+ if images:
287
+ # Convert PIL image to bytes for Streamlit
288
+ first_page = images[0]
289
+ img_bytes = io.BytesIO()
290
+ first_page.save(img_bytes, format='JPEG')
291
+ img_bytes.seek(0)
292
+
293
+ # Display the PDF preview
294
+ st.image(img_bytes, caption=f"PDF Preview: {uploaded_file.name}", use_container_width=True)
295
+ else:
296
+ st.info(f"PDF uploaded: {uploaded_file.name}")
297
+ except Exception:
298
+ # Simply show the file name without an error message
299
+ st.info(f"PDF uploaded: {uploaded_file.name}")
300
+ st.info("Click 'Process Document' to analyze the content.")
301
+ else:
302
+ st.image(uploaded_file, use_container_width=True)
303
+
304
+ # Add image preprocessing preview in a collapsible section if needed
305
+ if any(preprocessing_options.values()) and uploaded_file.type.startswith('image/'):
306
+ with st.expander("Image Preprocessing Preview"):
307
+ preview_cols = st.columns(2)
308
 
309
+ with preview_cols[0]:
310
+ st.markdown("**Original Image**")
311
+ st.image(uploaded_file, use_container_width=True)
312
 
313
+ with preview_cols[1]:
314
+ st.markdown("**Preprocessed Image**")
315
+ try:
316
+ processed_bytes = preprocess_image(uploaded_file.getvalue(), preprocessing_options)
317
+ st.image(io.BytesIO(processed_bytes), use_container_width=True)
318
+ except Exception as e:
319
+ st.error(f"Error in preprocessing: {str(e)}")
320
+
321
+ # Process button - flush left with similar padding as file browser
322
+ with upload_col:
323
+ process_button = st.button("Process Document", use_container_width=True)
324
+
325
+ # Results section
326
+ if process_button:
327
+ try:
328
+ # Get max_pages or default if not available
329
+ max_pages_value = max_pages if 'max_pages' in locals() else None
330
 
331
+ # Call process_file with all options
332
+ result = process_file(uploaded_file, use_vision, preprocessing_options)
333
+
334
+ # Create results tabs for better organization
335
+ results_tab1, results_tab2 = st.tabs(["Document Analysis", "Technical Details"])
336
+
337
+ with results_tab1:
338
+ # Create two columns for metadata and content
339
+ meta_col, content_col = st.columns([1, 2])
340
+
341
+ with meta_col:
342
+ st.subheader("Document Metadata")
343
+ st.success("**Document processed successfully**")
344
+
345
+ # Display file info
346
+ st.write(f"**File Name:** {result.get('file_name', uploaded_file.name)}")
347
+
348
+ # Display info if only limited pages were processed
349
+ if 'limited_pages' in result:
350
+ st.info(f"Processed {result['limited_pages']['processed']} of {result['limited_pages']['total']} pages")
351
+
352
+ # Display languages if available
353
+ if 'languages' in result:
354
+ languages = [lang for lang in result['languages'] if lang is not None]
355
+ if languages:
356
+ st.write(f"**Languages:** {', '.join(languages)}")
357
+
358
+ # Confidence score if available
359
+ if 'confidence_score' in result:
360
+ confidence = result['confidence_score']
361
+ st.write(f"**OCR Confidence:** {confidence:.1%}")
362
+
363
+ # Display topics if available
364
+ if 'topics' in result and result['topics']:
365
+ st.write(f"**Topics:** {', '.join(result['topics'])}")
366
+
367
+ with content_col:
368
+ st.subheader("Document Contents")
369
+ if 'ocr_contents' in result:
370
+ # Check if there are images in the OCR result
371
+ has_images = False
372
+ if 'raw_response' in result:
373
+ try:
374
+ has_images = any(page.images for page in result['raw_response'].pages)
375
+ except Exception:
376
+ has_images = False
377
+
378
+ # Create tabs for different views
379
+ if has_images:
380
+ view_tab1, view_tab2, view_tab3 = st.tabs(["Structured View", "Raw JSON", "With Images"])
381
+ else:
382
+ view_tab1, view_tab2 = st.tabs(["Structured View", "Raw JSON"])
383
+
384
+ with view_tab1:
385
+ # Display in a more user-friendly format based on the content structure
386
+ html_content = ""
387
+ if isinstance(result['ocr_contents'], dict):
388
+ for section, content in result['ocr_contents'].items():
389
+ if content: # Only display non-empty sections
390
+ section_title = f"<h4>{section.replace('_', ' ').title()}</h4>"
391
+ html_content += section_title
392
+
393
+ if isinstance(content, str):
394
+ html_content += f"<p>{content}</p>"
395
+ st.markdown(f"#### {section.replace('_', ' ').title()}")
396
+ st.markdown(content)
397
+ elif isinstance(content, list):
398
+ html_list = "<ul>"
399
+ st.markdown(f"#### {section.replace('_', ' ').title()}")
400
+ for item in content:
401
+ if isinstance(item, str):
402
+ html_list += f"<li>{item}</li>"
403
+ st.markdown(f"- {item}")
404
+ elif isinstance(item, dict):
405
+ html_list += f"<li>{json.dumps(item)}</li>"
406
+ st.json(item)
407
+ html_list += "</ul>"
408
+ html_content += html_list
409
+ elif isinstance(content, dict):
410
+ html_dict = "<dl>"
411
+ st.markdown(f"#### {section.replace('_', ' ').title()}")
412
+ for k, v in content.items():
413
+ html_dict += f"<dt><strong>{k}</strong></dt><dd>{v}</dd>"
414
+ st.markdown(f"**{k}:** {v}")
415
+ html_dict += "</dl>"
416
+ html_content += html_dict
417
+
418
+ # Add download button in a smaller section
419
+ with st.expander("Export Content"):
420
+ # Alternative download button
421
+ html_bytes = html_content.encode()
422
+ st.download_button(
423
+ label="Download as HTML",
424
+ data=html_bytes,
425
+ file_name="document_content.html",
426
+ mime="text/html"
427
+ )
428
+
429
+ with view_tab2:
430
+ # Show the raw JSON for developers
431
+ st.json(result)
432
+
433
+ if has_images:
434
+ with view_tab3:
435
+ # Show loading indicator while preparing images
436
+ with st.spinner("Preparing document with embedded images..."):
437
+ try:
438
+ # Import function
439
+ try:
440
+ from ocr_utils import get_combined_markdown
441
+ except ImportError:
442
+ st.error("Required module ocr_utils not found.")
443
+ st.stop()
444
+
445
+ # Check if raw_response is available
446
+ if 'raw_response' not in result:
447
+ st.warning("Raw OCR response not available. Cannot display images.")
448
+ st.stop()
449
+
450
+ # Validate the raw_response structure before processing
451
+ if not hasattr(result['raw_response'], 'pages'):
452
+ st.warning("Invalid OCR response format. Cannot display images.")
453
+ st.stop()
454
+
455
+ # Get the combined markdown with images
456
+ # Set a flag to compress images if needed
457
+ compress_images = True
458
+ max_image_width = 800 # Maximum width for images
459
+
460
+ try:
461
+ # First try to get combined markdown with compressed images
462
+ if compress_images and hasattr(result['raw_response'], 'pages'):
463
+ from ocr_utils import get_combined_markdown_compressed
464
+ combined_markdown = get_combined_markdown_compressed(
465
+ result['raw_response'],
466
+ max_width=max_image_width,
467
+ quality=85
468
+ )
469
+ else:
470
+ # Fall back to regular method if compression not available
471
+ combined_markdown = get_combined_markdown(result['raw_response'])
472
+ except (ImportError, AttributeError):
473
+ # Fall back to regular method
474
+ combined_markdown = get_combined_markdown(result['raw_response'])
475
+
476
+ if not combined_markdown or combined_markdown.strip() == "":
477
+ st.warning("No image content found in the document.")
478
+ st.stop()
479
+
480
+ # Check if there are many images that might cause loading issues
481
+ image_count = sum(len(page.images) for page in result['raw_response'].pages if hasattr(page, 'images'))
482
+
483
+ # Add warning for image-heavy documents
484
+ if image_count > 10:
485
+ st.warning(f"This document contains {image_count} images. Rendering may take longer than usual.")
486
+
487
+ # Add CSS to ensure proper spacing and handling of text and images
488
+ st.markdown("""
489
+ <style>
490
+ .markdown-text-container {
491
+ padding: 10px;
492
+ background-color: #f9f9f9;
493
+ border-radius: 5px;
494
+ }
495
+ .markdown-text-container img {
496
+ margin: 15px 0;
497
+ max-width: 100%;
498
+ border: 1px solid #ddd;
499
+ border-radius: 4px;
500
+ display: block;
501
+ }
502
+ .markdown-text-container p {
503
+ margin-bottom: 16px;
504
+ line-height: 1.6;
505
+ }
506
+ /* Add lazy loading for images to improve performance */
507
+ .markdown-text-container img {
508
+ loading: lazy;
509
+ }
510
+ </style>
511
+ """, unsafe_allow_html=True)
512
+
513
+ # For very image-heavy documents, show images in a paginated way
514
+ if image_count > 20:
515
+ # Show image content in a paginated way
516
+ st.write("Document contains many images. Showing in a paginated format:")
517
+
518
+ # Split the combined markdown by page separators
519
+ pages = combined_markdown.split("---")
520
+
521
+ # Create a page selector
522
+ page_num = st.selectbox("Select page to view:",
523
+ options=list(range(1, len(pages)+1)),
524
+ index=0)
525
+
526
+ # Display only the selected page
527
+ st.markdown(f"""
528
+ <div class="markdown-text-container">
529
+ {pages[page_num-1]}
530
+ </div>
531
+ """, unsafe_allow_html=True)
532
+
533
+ # Add note about pagination
534
+ st.info(f"Showing page {page_num} of {len(pages)}. Select a different page from the dropdown above.")
535
+ else:
536
+ # Wrap the markdown in a div with the class for styling
537
+ st.markdown(f"""
538
+ <div class="markdown-text-container">
539
+ {combined_markdown}
540
+ </div>
541
+ """, unsafe_allow_html=True)
542
+
543
+ # Add a download button for the combined content
544
+ st.download_button(
545
+ label="Download with Images (HTML)",
546
+ data=f"""
547
+ <html>
548
+ <head>
549
+ <style>
550
+ body {{ font-family: Arial, sans-serif; line-height: 1.6; }}
551
+ img {{ max-width: 100%; margin: 15px 0; }}
552
+ </style>
553
+ </head>
554
+ <body>
555
+ {combined_markdown}
556
+ </body>
557
+ </html>
558
+ """,
559
+ file_name="document_with_images.html",
560
+ mime="text/html"
561
+ )
562
+
563
+ except Exception as e:
564
+ st.error(f"Could not display document with images: {str(e)}")
565
+ st.info("Try refreshing or processing the document again.")
566
+ else:
567
+ st.error("No OCR content was extracted from the document.")
568
+
569
+ with results_tab2:
570
+ st.subheader("Raw Processing Results")
571
+ st.json(result)
572
+
573
+ except Exception as e:
574
+ st.error(f"Error processing document: {str(e)}")
575
+ else:
576
+ # Display sample images in the main area when no file is uploaded
577
+ st.info("Upload a document to get started using the file uploader above.")
578
+
579
+ # Show example images in a grid
580
+ st.subheader("Example Documents")
581
 
582
+ # Add a sample images container
583
+ with st.container():
584
+ # Find sample images from the input directory to display
585
+ input_dir = Path(__file__).parent / "input"
586
+ sample_images = []
587
+ if input_dir.exists():
588
+ # Find valid jpg files (with size > 50KB to avoid placeholders)
589
+ sample_images = [
590
+ path for path in input_dir.glob("*.jpg")
591
+ if path.stat().st_size > 50000
592
+ ][:3] # Limit to 3 samples
593
 
594
+ if sample_images:
595
+ columns = st.columns(3)
596
+ for i, img_path in enumerate(sample_images):
597
+ with columns[i % 3]:
598
+ try:
599
+ st.image(str(img_path), caption=img_path.name, use_container_width=True)
600
+ except Exception as e:
601
+ st.error(f"Error loading image {img_path.name}: {str(e)}")
 
 
 
backup/app.py ADDED
@@ -0,0 +1,535 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import streamlit as st
3
+ import json
4
+ import sys
5
+ import time
6
+ from pathlib import Path
7
+ import tempfile
8
+ import io
9
+ from pdf2image import convert_from_bytes
10
+ from PIL import Image, ImageEnhance, ImageFilter
11
+ import cv2
12
+ import numpy as np
13
+
14
+ # Import the StructuredOCR class and config from the local files
15
+ from structured_ocr import StructuredOCR
16
+ from config import MISTRAL_API_KEY
17
+
18
+ # Set page configuration
19
+ st.set_page_config(
20
+ page_title="Historical OCR",
21
+ page_icon="🚀",
22
+ layout="wide",
23
+ initial_sidebar_state="expanded"
24
+ )
25
+
26
+ # Enable caching for expensive operations
27
+ @st.cache_data(ttl=3600, show_spinner=False)
28
+ def convert_pdf_to_images(pdf_bytes, dpi=150):
29
+ """Convert PDF bytes to a list of images with caching"""
30
+ try:
31
+ return convert_from_bytes(pdf_bytes, dpi=dpi)
32
+ except Exception as e:
33
+ st.error(f"Error converting PDF: {str(e)}")
34
+ return []
35
+
36
+ @st.cache_data(ttl=3600, show_spinner=False)
37
+ def preprocess_image(image_bytes, preprocessing_options):
38
+ """Preprocess image with selected options"""
39
+ # Convert bytes to OpenCV format
40
+ image = Image.open(io.BytesIO(image_bytes))
41
+ img_array = np.array(image)
42
+
43
+ # Apply preprocessing based on selected options
44
+ if preprocessing_options.get("grayscale", False):
45
+ img_array = cv2.cvtColor(img_array, cv2.COLOR_RGB2GRAY)
46
+ img_array = cv2.cvtColor(img_array, cv2.COLOR_GRAY2RGB)
47
+
48
+ if preprocessing_options.get("contrast", 0) != 0:
49
+ contrast_factor = 1 + (preprocessing_options.get("contrast", 0) / 10)
50
+ image = Image.fromarray(img_array)
51
+ enhancer = ImageEnhance.Contrast(image)
52
+ image = enhancer.enhance(contrast_factor)
53
+ img_array = np.array(image)
54
+
55
+ if preprocessing_options.get("denoise", False):
56
+ img_array = cv2.fastNlMeansDenoisingColored(img_array, None, 10, 10, 7, 21)
57
+
58
+ if preprocessing_options.get("threshold", False):
59
+ # Convert to grayscale if not already
60
+ if len(img_array.shape) == 3:
61
+ gray = cv2.cvtColor(img_array, cv2.COLOR_RGB2GRAY)
62
+ else:
63
+ gray = img_array
64
+ # Apply adaptive threshold
65
+ binary = cv2.adaptiveThreshold(gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
66
+ cv2.THRESH_BINARY, 11, 2)
67
+ # Convert back to RGB
68
+ img_array = cv2.cvtColor(binary, cv2.COLOR_GRAY2RGB)
69
+
70
+ # Convert back to PIL Image
71
+ processed_image = Image.fromarray(img_array)
72
+
73
+ # Convert to bytes
74
+ byte_io = io.BytesIO()
75
+ processed_image.save(byte_io, format='PNG')
76
+ byte_io.seek(0)
77
+
78
+ return byte_io.getvalue()
79
+
80
+ # Define functions
81
+ def process_file(uploaded_file, use_vision=True, preprocessing_options=None):
82
+ """Process the uploaded file and return the OCR results
83
+
84
+ Args:
85
+ uploaded_file: The uploaded file to process
86
+ use_vision: Whether to use vision model
87
+ preprocessing_options: Dictionary of preprocessing options
88
+ """
89
+ if preprocessing_options is None:
90
+ preprocessing_options = {}
91
+
92
+ # Show progress indicator
93
+ progress_bar = st.progress(0)
94
+ status_text = st.empty()
95
+ status_text.text("Preparing file for processing...")
96
+
97
+ # Save the uploaded file to a temporary file
98
+ with tempfile.NamedTemporaryFile(delete=False, suffix=Path(uploaded_file.name).suffix) as tmp:
99
+ tmp.write(uploaded_file.getvalue())
100
+ temp_path = tmp.name
101
+
102
+ try:
103
+ # Check if API key is available
104
+ if not MISTRAL_API_KEY:
105
+ # Return dummy data if no API key
106
+ progress_bar.progress(100)
107
+ status_text.empty()
108
+ return {
109
+ "file_name": uploaded_file.name,
110
+ "topics": ["Sample Document"],
111
+ "languages": ["English"],
112
+ "ocr_contents": {
113
+ "title": "Sample Document",
114
+ "content": "This is sample content. To process real documents, please set the MISTRAL_API_KEY environment variable."
115
+ }
116
+ }
117
+
118
+ # Update progress
119
+ progress_bar.progress(20)
120
+ status_text.text("Initializing OCR processor...")
121
+
122
+ # Initialize OCR processor
123
+ processor = StructuredOCR()
124
+
125
+ # Determine file type from extension
126
+ file_ext = Path(uploaded_file.name).suffix.lower()
127
+ file_type = "pdf" if file_ext == ".pdf" else "image"
128
+
129
+ # Apply preprocessing if needed
130
+ if any(preprocessing_options.values()) and file_type == "image":
131
+ status_text.text("Applying image preprocessing...")
132
+ processed_bytes = preprocess_image(uploaded_file.getvalue(), preprocessing_options)
133
+
134
+ # Save processed image to temp file
135
+ with tempfile.NamedTemporaryFile(delete=False, suffix=Path(uploaded_file.name).suffix) as proc_tmp:
136
+ proc_tmp.write(processed_bytes)
137
+ temp_path = proc_tmp.name
138
+
139
+ # Get file size in MB
140
+ file_size_mb = os.path.getsize(temp_path) / (1024 * 1024)
141
+
142
+ # Check if file exceeds API limits (50 MB)
143
+ if file_size_mb > 50:
144
+ st.error(f"File too large ({file_size_mb:.1f} MB). Maximum file size allowed by Mistral API is 50MB.")
145
+ return {
146
+ "file_name": uploaded_file.name,
147
+ "topics": ["Document"],
148
+ "languages": ["English"],
149
+ "confidence_score": 0.0,
150
+ "error": f"File size {file_size_mb:.2f} MB exceeds Mistral API limit of 50 MB",
151
+ "ocr_contents": {
152
+ "error": f"Failed to process file: File size {file_size_mb:.2f} MB exceeds Mistral API limit of 50 MB",
153
+ "partial_text": "Document could not be processed due to size limitations."
154
+ }
155
+ }
156
+
157
+ # Update progress
158
+ progress_bar.progress(40)
159
+ status_text.text("Processing document with OCR...")
160
+
161
+ # Process the file with file size information for automatic page limiting
162
+ # Make sure we're using the latest mistral-ocr model
163
+ # See https://docs.mistral.ai/capabilities/document/ for more info
164
+ result = processor.process_file(temp_path, file_type=file_type, use_vision=use_vision, file_size_mb=file_size_mb)
165
+
166
+ # Complete progress
167
+ progress_bar.progress(100)
168
+ status_text.empty()
169
+
170
+ return result
171
+ except Exception as e:
172
+ progress_bar.progress(100)
173
+ status_text.empty()
174
+ st.error(f"Error during processing: {str(e)}")
175
+ raise
176
+ finally:
177
+ # Clean up the temporary file
178
+ if os.path.exists(temp_path):
179
+ os.unlink(temp_path)
180
+
181
+ # App title and description
182
+ st.title("Historical Document OCR")
183
+ st.subheader("Powered by Mistral AI")
184
+
185
+ # Create main layout with tabs and columns
186
+ main_tab1, main_tab2 = st.tabs(["Document Processing", "About"])
187
+
188
+ with main_tab1:
189
+ # Create a two-column layout for file upload and preview
190
+ upload_col, preview_col = st.columns([1, 1])
191
+
192
+ # File uploader in the left column
193
+ with upload_col:
194
+ st.markdown("""
195
+ Upload an image or PDF file to get started.
196
+
197
+ Using the latest `mistral-ocr-latest` model for advanced document understanding.
198
+ """)
199
+ uploaded_file = st.file_uploader("Choose a file", type=["pdf", "png", "jpg", "jpeg"])
200
+
201
+ # Sidebar with options
202
+ with st.sidebar:
203
+ st.header("Options")
204
+
205
+ # Model options
206
+ st.subheader("Model Settings")
207
+ use_vision = st.checkbox("Use Vision Model", value=True,
208
+ help="For image files, use the vision model for improved analysis (may be slower)")
209
+
210
+ # Image preprocessing options (collapsible)
211
+ st.subheader("Image Preprocessing")
212
+ with st.expander("Preprocessing Options"):
213
+ preprocessing_options = {}
214
+ preprocessing_options["grayscale"] = st.checkbox("Convert to Grayscale",
215
+ help="Convert image to grayscale before OCR")
216
+ preprocessing_options["threshold"] = st.checkbox("Apply Thresholding",
217
+ help="Apply adaptive thresholding to enhance text")
218
+ preprocessing_options["denoise"] = st.checkbox("Denoise Image",
219
+ help="Remove noise from the image")
220
+ preprocessing_options["contrast"] = st.slider("Adjust Contrast", -5, 5, 0,
221
+ help="Adjust image contrast (-5 to +5)")
222
+
223
+ # PDF options (collapsible)
224
+ st.subheader("PDF Options")
225
+ with st.expander("PDF Settings"):
226
+ pdf_dpi = st.slider("PDF Resolution (DPI)", 72, 300, 150,
227
+ help="Higher DPI gives better quality but slower processing")
228
+ max_pages = st.number_input("Maximum Pages to Process", 1, 20, 5,
229
+ help="Limit number of pages to process")
230
+
231
+ # About tab content
232
+ with main_tab2:
233
+ st.markdown("""
234
+ ### About This Application
235
+
236
+ This app uses [Mistral AI's Document OCR](https://docs.mistral.ai/capabilities/document/) to extract text and images from historical documents.
237
+
238
+ It can process:
239
+ - Image files (jpg, png, etc.)
240
+ - PDF documents (multi-page support)
241
+
242
+ The extracted content is processed into structured data based on the document type, combining:
243
+ - Text extraction with `mistral-ocr-latest`
244
+ - Analysis with language models
245
+ - Layout preservation with images
246
+
247
+ View results in three formats:
248
+ - Structured HTML view
249
+ - Raw JSON (for developers)
250
+ - Markdown with images (preserves document layout)
251
+
252
+ **New Features:**
253
+ - Image preprocessing for better OCR quality
254
+ - PDF resolution and page controls
255
+ - Progress tracking during processing
256
+ """)
257
+
258
+ with main_tab1:
259
+ if uploaded_file is not None:
260
+ # Check file size (cap at 50MB)
261
+ file_size_mb = len(uploaded_file.getvalue()) / (1024 * 1024)
262
+
263
+ if file_size_mb > 50:
264
+ with upload_col:
265
+ st.error(f"File too large ({file_size_mb:.1f} MB). Maximum file size is 50MB.")
266
+ st.stop()
267
+
268
+ file_ext = Path(uploaded_file.name).suffix.lower()
269
+
270
+ # Display document preview in preview column
271
+ with preview_col:
272
+ st.subheader("Document Preview")
273
+ if file_ext == ".pdf":
274
+ try:
275
+ # Convert first page of PDF to image for preview
276
+ pdf_bytes = uploaded_file.getvalue()
277
+ images = convert_from_bytes(pdf_bytes, first_page=1, last_page=1, dpi=150)
278
+
279
+ if images:
280
+ # Convert PIL image to bytes for Streamlit
281
+ first_page = images[0]
282
+ img_bytes = io.BytesIO()
283
+ first_page.save(img_bytes, format='JPEG')
284
+ img_bytes.seek(0)
285
+
286
+ # Display the PDF preview
287
+ st.image(img_bytes, caption=f"PDF Preview: {uploaded_file.name}", use_container_width=True)
288
+ else:
289
+ st.info(f"PDF uploaded: {uploaded_file.name}")
290
+ except Exception:
291
+ # Simply show the file name without an error message
292
+ st.info(f"PDF uploaded: {uploaded_file.name}")
293
+ st.info("Click 'Process Document' to analyze the content.")
294
+ else:
295
+ st.image(uploaded_file, use_container_width=True)
296
+
297
+ # Add image preprocessing preview in a collapsible section if needed
298
+ if any(preprocessing_options.values()) and uploaded_file.type.startswith('image/'):
299
+ with st.expander("Image Preprocessing Preview"):
300
+ preview_cols = st.columns(2)
301
+
302
+ with preview_cols[0]:
303
+ st.markdown("**Original Image**")
304
+ st.image(uploaded_file, use_container_width=True)
305
+
306
+ with preview_cols[1]:
307
+ st.markdown("**Preprocessed Image**")
308
+ try:
309
+ processed_bytes = preprocess_image(uploaded_file.getvalue(), preprocessing_options)
310
+ st.image(io.BytesIO(processed_bytes), use_container_width=True)
311
+ except Exception as e:
312
+ st.error(f"Error in preprocessing: {str(e)}")
313
+
314
+ # Process button - flush left with similar padding as file browser
315
+ with upload_col:
316
+ process_button = st.button("Process Document", use_container_width=True)
317
+
318
+ # Results section
319
+ if process_button:
320
+ try:
321
+ # Get max_pages or default if not available
322
+ max_pages_value = max_pages if 'max_pages' in locals() else None
323
+
324
+ # Call process_file with all options
325
+ result = process_file(uploaded_file, use_vision, preprocessing_options)
326
+
327
+ # Create results tabs for better organization
328
+ results_tab1, results_tab2 = st.tabs(["Document Analysis", "Technical Details"])
329
+
330
+ with results_tab1:
331
+ # Create two columns for metadata and content
332
+ meta_col, content_col = st.columns([1, 2])
333
+
334
+ with meta_col:
335
+ st.subheader("Document Metadata")
336
+ st.success("**Document processed successfully**")
337
+
338
+ # Display file info
339
+ st.write(f"**File Name:** {result.get('file_name', uploaded_file.name)}")
340
+
341
+ # Display info if only limited pages were processed
342
+ if 'limited_pages' in result:
343
+ st.info(f"Processed {result['limited_pages']['processed']} of {result['limited_pages']['total']} pages")
344
+
345
+ # Display languages if available
346
+ if 'languages' in result:
347
+ languages = [lang for lang in result['languages'] if lang is not None]
348
+ if languages:
349
+ st.write(f"**Languages:** {', '.join(languages)}")
350
+
351
+ # Confidence score if available
352
+ if 'confidence_score' in result:
353
+ confidence = result['confidence_score']
354
+ st.write(f"**OCR Confidence:** {confidence:.1%}")
355
+
356
+ # Display topics if available
357
+ if 'topics' in result and result['topics']:
358
+ st.write(f"**Topics:** {', '.join(result['topics'])}")
359
+
360
+ with content_col:
361
+ st.subheader("Document Contents")
362
+ if 'ocr_contents' in result:
363
+ # Check if there are images in the OCR result
364
+ has_images = False
365
+ if 'raw_response' in result:
366
+ try:
367
+ has_images = any(page.images for page in result['raw_response'].pages)
368
+ except Exception:
369
+ has_images = False
370
+
371
+ # Create tabs for different views
372
+ if has_images:
373
+ view_tab1, view_tab2, view_tab3 = st.tabs(["Structured View", "Raw JSON", "With Images"])
374
+ else:
375
+ view_tab1, view_tab2 = st.tabs(["Structured View", "Raw JSON"])
376
+
377
+ with view_tab1:
378
+ # Display in a more user-friendly format based on the content structure
379
+ html_content = ""
380
+ if isinstance(result['ocr_contents'], dict):
381
+ for section, content in result['ocr_contents'].items():
382
+ if content: # Only display non-empty sections
383
+ section_title = f"<h4>{section.replace('_', ' ').title()}</h4>"
384
+ html_content += section_title
385
+
386
+ if isinstance(content, str):
387
+ html_content += f"<p>{content}</p>"
388
+ st.markdown(f"#### {section.replace('_', ' ').title()}")
389
+ st.markdown(content)
390
+ elif isinstance(content, list):
391
+ html_list = "<ul>"
392
+ st.markdown(f"#### {section.replace('_', ' ').title()}")
393
+ for item in content:
394
+ if isinstance(item, str):
395
+ html_list += f"<li>{item}</li>"
396
+ st.markdown(f"- {item}")
397
+ elif isinstance(item, dict):
398
+ html_list += f"<li>{json.dumps(item)}</li>"
399
+ st.json(item)
400
+ html_list += "</ul>"
401
+ html_content += html_list
402
+ elif isinstance(content, dict):
403
+ html_dict = "<dl>"
404
+ st.markdown(f"#### {section.replace('_', ' ').title()}")
405
+ for k, v in content.items():
406
+ html_dict += f"<dt><strong>{k}</strong></dt><dd>{v}</dd>"
407
+ st.markdown(f"**{k}:** {v}")
408
+ html_dict += "</dl>"
409
+ html_content += html_dict
410
+
411
+ # Add download button in a smaller section
412
+ with st.expander("Export Content"):
413
+ # Alternative download button
414
+ html_bytes = html_content.encode()
415
+ st.download_button(
416
+ label="Download as HTML",
417
+ data=html_bytes,
418
+ file_name="document_content.html",
419
+ mime="text/html"
420
+ )
421
+
422
+ with view_tab2:
423
+ # Show the raw JSON for developers
424
+ st.json(result)
425
+
426
+ if has_images:
427
+ with view_tab3:
428
+ # Show loading indicator while preparing images
429
+ with st.spinner("Preparing document with embedded images..."):
430
+ try:
431
+ # Import function
432
+ try:
433
+ from ocr_utils import get_combined_markdown
434
+ except ImportError:
435
+ st.error("Required module ocr_utils not found.")
436
+ st.stop()
437
+
438
+ # Check if raw_response is available
439
+ if 'raw_response' not in result:
440
+ st.warning("Raw OCR response not available. Cannot display images.")
441
+ st.stop()
442
+
443
+ # Validate the raw_response structure before processing
444
+ if not hasattr(result['raw_response'], 'pages'):
445
+ st.warning("Invalid OCR response format. Cannot display images.")
446
+ st.stop()
447
+
448
+ # Get the combined markdown with images
449
+ combined_markdown = get_combined_markdown(result['raw_response'])
450
+
451
+ if not combined_markdown or combined_markdown.strip() == "":
452
+ st.warning("No image content found in the document.")
453
+ st.stop()
454
+
455
+ # Add CSS to ensure proper spacing and handling of text and images
456
+ st.markdown("""
457
+ <style>
458
+ .markdown-text-container {
459
+ padding: 10px;
460
+ background-color: #f9f9f9;
461
+ border-radius: 5px;
462
+ }
463
+ .markdown-text-container img {
464
+ margin: 15px 0;
465
+ max-width: 100%;
466
+ border: 1px solid #ddd;
467
+ border-radius: 4px;
468
+ display: block;
469
+ }
470
+ .markdown-text-container p {
471
+ margin-bottom: 16px;
472
+ line-height: 1.6;
473
+ }
474
+ </style>
475
+ """, unsafe_allow_html=True)
476
+
477
+ # Wrap the markdown in a div with the class for styling
478
+ st.markdown(f"""
479
+ <div class="markdown-text-container">
480
+ {combined_markdown}
481
+ </div>
482
+ """, unsafe_allow_html=True)
483
+
484
+ # Add a download button for the combined content
485
+ st.download_button(
486
+ label="Download with Images (HTML)",
487
+ data=f"""
488
+ <html>
489
+ <head>
490
+ <style>
491
+ body {{ font-family: Arial, sans-serif; line-height: 1.6; }}
492
+ img {{ max-width: 100%; margin: 15px 0; }}
493
+ </style>
494
+ </head>
495
+ <body>
496
+ {combined_markdown}
497
+ </body>
498
+ </html>
499
+ """,
500
+ file_name="document_with_images.html",
501
+ mime="text/html"
502
+ )
503
+
504
+ except Exception as e:
505
+ st.error(f"Could not display document with images: {str(e)}")
506
+ st.info("Try refreshing or processing the document again.")
507
+ else:
508
+ st.error("No OCR content was extracted from the document.")
509
+
510
+ with results_tab2:
511
+ st.subheader("Raw Processing Results")
512
+ st.json(result)
513
+
514
+ except Exception as e:
515
+ st.error(f"Error processing document: {str(e)}")
516
+ else:
517
+ # Display sample images in the main area when no file is uploaded
518
+ st.info("Upload a document to get started using the file uploader above.")
519
+
520
+ # Show example images in a grid
521
+ st.subheader("Example Documents")
522
+
523
+ # Add a sample images container
524
+ with st.container():
525
+ # Find sample images from the input directory to display
526
+ input_dir = Path(__file__).parent / "input"
527
+ sample_images = []
528
+ if input_dir.exists():
529
+ sample_images = list(input_dir.glob("*.jpg"))[:3] # Limit to 3 samples
530
+
531
+ if sample_images:
532
+ columns = st.columns(3)
533
+ for i, img_path in enumerate(sample_images):
534
+ with columns[i % 3]:
535
+ st.image(str(img_path), caption=img_path.name, use_container_width=True)
backup/config.py ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # config.py
2
+ """
3
+ Configuration file for Mistral OCR processing.
4
+ Contains API key and other settings.
5
+ """
6
+ import os
7
+
8
+ # Your Mistral API key - get from Hugging Face secrets or environment variable
9
+ # The priority order is: HF_SPACES environment var > regular environment var > empty string
10
+ # Note: No default API key is provided for security reasons
11
+ MISTRAL_API_KEY = os.environ.get("HF_MISTRAL_API_KEY", # First check HF-specific env var
12
+ os.environ.get("MISTRAL_API_KEY", "")) # Then check regular env var
13
+
14
+ # Model settings
15
+ OCR_MODEL = "mistral-ocr-latest"
16
+ TEXT_MODEL = "ministral-8b-latest"
17
+ VISION_MODEL = "pixtral-12b-latest"
input/magician-or-bottle-cungerer.jpg → backup/input/The Magician, or Bottle Cungerer.jpeg RENAMED
File without changes
input/baldwin-15th-north.jpg → backup/input/baldwin-letter-1.jpg RENAMED
File without changes
input/americae-retectio.jpg → backup/input/baldwin-letter-2.jpg RENAMED
File without changes
backup/input/flier.png ADDED
input/baldwin-letter.jpg → backup/input/letter-1.jpg RENAMED
File without changes
input/gender.jpg → backup/input/letter-2.jpg RENAMED
File without changes
input/photo-baldwin-letter.jpg → backup/input/letter-3.jpg RENAMED
File without changes
backup/input/magellan-travels.jpg ADDED

Git LFS Details

  • SHA256: ae3e860789e2c3c8032499e5326864294dbc1b01059169fd08203c980577010b
  • Pointer size: 131 Bytes
  • Size of remote file: 283 kB
input/handwritten-journal.jpg → backup/input/menu.pdf RENAMED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:279f7c915ae54aafb30e6d70e480eb74e73b6aa92de20f60cd13019e9debbb62
3
- size 1459485
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:42d96008f374f5be8046b569c868e33f4e5a0e5e166c245d324b44140c7e6c2e
3
+ size 2554815
backup/input/recipe.jpg ADDED
backup/ocr_utils.py ADDED
@@ -0,0 +1,136 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Utility functions for OCR processing with Mistral AI.
3
+ Contains helper functions for working with OCR responses and image handling.
4
+ """
5
+
6
+ import json
7
+ import base64
8
+ from pathlib import Path
9
+ from typing import Dict, List, Optional, Union
10
+
11
+ from mistralai import DocumentURLChunk, ImageURLChunk, TextChunk
12
+
13
+ def replace_images_in_markdown(markdown_str: str, images_dict: dict) -> str:
14
+ """
15
+ Replace image placeholders in markdown with base64-encoded images.
16
+
17
+ Args:
18
+ markdown_str: Markdown text containing image placeholders
19
+ images_dict: Dictionary mapping image IDs to base64 strings
20
+
21
+ Returns:
22
+ Markdown text with images replaced by base64 data
23
+ """
24
+ for img_name, base64_str in images_dict.items():
25
+ markdown_str = markdown_str.replace(
26
+ f"![{img_name}]({img_name})", f"![{img_name}]({base64_str})"
27
+ )
28
+ return markdown_str
29
+
30
+ def get_combined_markdown(ocr_response) -> str:
31
+ """
32
+ Combine OCR text and images into a single markdown document.
33
+ Ensures proper spacing between text and images.
34
+
35
+ Args:
36
+ ocr_response: Response from OCR processing containing text and images
37
+ See https://docs.mistral.ai/capabilities/document/ for API reference
38
+
39
+ Returns:
40
+ Combined markdown string with embedded images
41
+ """
42
+ markdowns: list[str] = []
43
+ # Extract images from page
44
+ for page in ocr_response.pages:
45
+ image_data = {}
46
+ for img in page.images:
47
+ image_data[img.id] = img.image_base64
48
+
49
+ # Replace image placeholders with actual images
50
+ page_markdown = replace_images_in_markdown(page.markdown, image_data)
51
+
52
+ # Ensure proper spacing between paragraphs and images
53
+ # Add extra newlines between paragraphs to improve rendering
54
+ page_markdown = page_markdown.replace("\n", "\n\n")
55
+
56
+ # Add page separator for multi-page documents
57
+ markdowns.append(page_markdown)
58
+
59
+ # Join pages with clear separators for multi-page documents
60
+ return "\n\n---\n\n".join(markdowns)
61
+
62
+ def encode_image_for_api(image_path: Union[str, Path]) -> str:
63
+ """
64
+ Encode an image as base64 for API use.
65
+
66
+ Args:
67
+ image_path: Path to the image file
68
+
69
+ Returns:
70
+ Base64 data URL for the image
71
+ """
72
+ # Convert to Path object if string
73
+ image_file = Path(image_path) if isinstance(image_path, str) else image_path
74
+
75
+ # Verify image exists
76
+ if not image_file.is_file():
77
+ raise FileNotFoundError(f"Image file not found: {image_file}")
78
+
79
+ # Encode image as base64
80
+ encoded = base64.b64encode(image_file.read_bytes()).decode()
81
+ return f"data:image/jpeg;base64,{encoded}"
82
+
83
+ def process_image_with_ocr(client, image_path: Union[str, Path], model: str = "mistral-ocr-latest"):
84
+ """
85
+ Process an image with OCR and return the response.
86
+
87
+ Args:
88
+ client: Mistral AI client
89
+ image_path: Path to the image file
90
+ model: OCR model to use
91
+
92
+ Returns:
93
+ OCR response object
94
+ """
95
+ # Encode image as base64
96
+ base64_data_url = encode_image_for_api(image_path)
97
+
98
+ # Process image with OCR
99
+ image_response = client.ocr.process(
100
+ document=ImageURLChunk(image_url=base64_data_url),
101
+ model=model
102
+ )
103
+
104
+ return image_response
105
+
106
+ def ocr_response_to_json(ocr_response, indent: int = 4) -> str:
107
+ """
108
+ Convert OCR response to a formatted JSON string.
109
+
110
+ Args:
111
+ ocr_response: OCR response object
112
+ indent: Indentation level for JSON formatting
113
+
114
+ Returns:
115
+ Formatted JSON string
116
+ """
117
+ # Convert response to JSON
118
+ response_dict = json.loads(ocr_response.model_dump_json())
119
+ return json.dumps(response_dict, indent=indent)
120
+
121
+ # For display in notebooks
122
+ try:
123
+ from IPython.display import Markdown, display
124
+
125
+ def display_ocr_with_images(ocr_response):
126
+ """
127
+ Display OCR response with embedded images in IPython environments.
128
+
129
+ Args:
130
+ ocr_response: OCR response object
131
+ """
132
+ combined_markdown = get_combined_markdown(ocr_response)
133
+ display(Markdown(combined_markdown))
134
+ except ImportError:
135
+ # IPython not available
136
+ pass
backup/pdf_ocr.py ADDED
@@ -0,0 +1,76 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ PDFOCR - Module for processing PDF files with OCR and extracting structured data.
4
+ """
5
+
6
+ import json
7
+ from pathlib import Path
8
+ from structured_ocr import StructuredOCR
9
+
10
+ class PDFOCR:
11
+ """Class for processing PDF files with OCR and extracting structured data."""
12
+
13
+ def __init__(self, api_key=None):
14
+ """Initialize the PDF OCR processor."""
15
+ self.processor = StructuredOCR(api_key=api_key)
16
+
17
+ def process_pdf(self, pdf_path, use_vision=True):
18
+ """
19
+ Process a PDF file with OCR and extract structured data.
20
+
21
+ Args:
22
+ pdf_path: Path to the PDF file
23
+ use_vision: Whether to use vision model for improved analysis
24
+
25
+ Returns:
26
+ Dictionary with structured OCR results
27
+ """
28
+ pdf_path = Path(pdf_path)
29
+ if not pdf_path.exists():
30
+ raise FileNotFoundError(f"PDF file not found: {pdf_path}")
31
+
32
+ return self.processor.process_file(pdf_path, file_type="pdf", use_vision=use_vision)
33
+
34
+ def save_json_output(self, pdf_path, output_path, use_vision=True):
35
+ """
36
+ Process a PDF file and save the structured output as JSON.
37
+
38
+ Args:
39
+ pdf_path: Path to the PDF file
40
+ output_path: Path where to save the JSON output
41
+ use_vision: Whether to use vision model for improved analysis
42
+
43
+ Returns:
44
+ Path to the saved JSON file
45
+ """
46
+ # Process the PDF
47
+ result = self.process_pdf(pdf_path, use_vision=use_vision)
48
+
49
+ # Save the result to JSON
50
+ output_path = Path(output_path)
51
+ output_path.parent.mkdir(parents=True, exist_ok=True)
52
+
53
+ with open(output_path, 'w') as f:
54
+ json.dump(result, f, indent=2)
55
+
56
+ return output_path
57
+
58
+ # For testing directly
59
+ if __name__ == "__main__":
60
+ import sys
61
+
62
+ if len(sys.argv) < 2:
63
+ print("Usage: python pdf_ocr.py <pdf_path> [output_path]")
64
+ sys.exit(1)
65
+
66
+ pdf_path = sys.argv[1]
67
+ output_path = sys.argv[2] if len(sys.argv) > 2 else None
68
+
69
+ processor = PDFOCR()
70
+
71
+ if output_path:
72
+ result_path = processor.save_json_output(pdf_path, output_path)
73
+ print(f"Results saved to: {result_path}")
74
+ else:
75
+ result = processor.process_pdf(pdf_path)
76
+ print(json.dumps(result, indent=2))
backup/requirements.txt ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ streamlit>=1.43.2
2
+ mistralai>=0.0.7
3
+ pydantic>=2.0.0
4
+ pycountry>=23.12.11
5
+ pillow>=10.0.0
6
+ python-multipart>=0.0.6
7
+ pdf2image>=1.17.0
8
+ pytesseract>=0.3.10
9
+ opencv-python-headless>=4.6.0
10
+ numpy>=1.23.5
backup/structured_ocr.py ADDED
@@ -0,0 +1,414 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import sys
3
+ import time
4
+ from enum import Enum
5
+ from pathlib import Path
6
+ import json
7
+ import base64
8
+ import pycountry
9
+ import logging
10
+ from pydantic import BaseModel
11
+ from mistralai import Mistral
12
+ from mistralai import DocumentURLChunk, ImageURLChunk, TextChunk
13
+
14
+ # Configure logging
15
+ logging.basicConfig(level=logging.INFO,
16
+ format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
17
+
18
+ # Import utilities for OCR processing
19
+ try:
20
+ from ocr_utils import replace_images_in_markdown, get_combined_markdown
21
+ except ImportError:
22
+ # Define fallback functions if module not found
23
+ def replace_images_in_markdown(markdown_str, images_dict):
24
+ for img_name, base64_str in images_dict.items():
25
+ markdown_str = markdown_str.replace(
26
+ f"![{img_name}]({img_name})", f"![{img_name}]({base64_str})"
27
+ )
28
+ return markdown_str
29
+
30
+ def get_combined_markdown(ocr_response):
31
+ markdowns = []
32
+ for page in ocr_response.pages:
33
+ image_data = {}
34
+ for img in page.images:
35
+ image_data[img.id] = img.image_base64
36
+ markdowns.append(replace_images_in_markdown(page.markdown, image_data))
37
+ return "\n\n".join(markdowns)
38
+
39
+ # Import config directly (now local to historical-ocr)
40
+ from config import MISTRAL_API_KEY, OCR_MODEL, TEXT_MODEL, VISION_MODEL
41
+
42
# Build an Enum of language names for structured output: one member per
# language that pycountry knows an ISO 639-1 (alpha_2) code for.
languages = {lang.alpha_2: lang.name for lang in pycountry.languages if hasattr(lang, 'alpha_2')}


class LanguageMeta(Enum.__class__):
    """Metaclass that injects one enum member per known language.

    Member names are the upper-cased language names with spaces replaced by
    underscores; member values are the original language names.
    """

    def __new__(metacls, cls, bases, classdict):
        # Populate the enum class dict before Enum's own machinery runs.
        for _code, language_name in languages.items():
            classdict[language_name.upper().replace(' ', '_')] = language_name
        return super().__new__(metacls, cls, bases, classdict)


class Language(Enum, metaclass=LanguageMeta):
    """Enum of language names; all members are generated by LanguageMeta."""
    pass
53
+
54
class StructuredOCRModel(BaseModel):
    """Schema for the structured JSON the chat models are asked to emit."""

    file_name: str             # original file name of the processed document
    topics: list[str]          # high-level topics detected in the document
    languages: list[Language]  # languages detected in the document
    ocr_contents: dict         # organized OCR text, keyed by logical section
59
+
60
class StructuredOCR:
    """OCR pipeline built on the Mistral API.

    Uploads a PDF or image, runs the dedicated OCR model over it, then asks
    a text or vision chat model to reorganize the raw OCR markdown into a
    structured dictionary (topics, languages, logical sections).
    """

    def __init__(self, api_key=None):
        """Initialize the OCR processor.

        Args:
            api_key: Mistral API key; falls back to MISTRAL_API_KEY from config.
        """
        self.api_key = api_key or MISTRAL_API_KEY
        self.client = Mistral(api_key=self.api_key)

    def process_file(self, file_path, file_type=None, use_vision=True, max_pages=None, file_size_mb=None, custom_pages=None):
        """Process a file and return structured OCR results.

        Args:
            file_path: Path to the file to process
            file_type: 'pdf' or 'image' (will be auto-detected if None)
            use_vision: Whether to use vision model for improved analysis
            max_pages: Optional limit on number of pages to process
            file_size_mb: Optional file size in MB (used for automatic page limiting)
            custom_pages: Optional list of specific page numbers to process

        Returns:
            Dictionary with structured OCR results (an error payload on failure).
        """
        # Convert file_path to Path object if it's a string
        file_path = Path(file_path)

        # Auto-detect file type from the extension if not provided
        if file_type is None:
            suffix = file_path.suffix.lower()
            file_type = "pdf" if suffix == ".pdf" else "image"

        # Get file size if not provided
        if file_size_mb is None and file_path.exists():
            file_size_mb = file_path.stat().st_size / (1024 * 1024)  # bytes -> MB

        # Reject files that exceed the Mistral API limit (50 MB)
        if file_size_mb and file_size_mb > 50:
            logging.warning(f"File size {file_size_mb:.2f} MB exceeds Mistral API limit of 50 MB")
            return {
                "file_name": file_path.name,
                "topics": ["Document"],
                "languages": ["English"],
                "confidence_score": 0.0,
                "error": f"File size {file_size_mb:.2f} MB exceeds API limit of 50 MB",
                "ocr_contents": {
                    "error": f"Failed to process file: File size {file_size_mb:.2f} MB exceeds Mistral API limit of 50 MB",
                    "partial_text": "Document could not be processed due to size limitations."
                }
            }

        # For PDF files, limit pages based on file size if no explicit limit is given.
        # NOTE(review): the > 100 and > 50 branches are currently unreachable —
        # anything over 50 MB already returned above. Kept so the sizing policy
        # survives if that hard cap is ever raised.
        if file_type == "pdf" and file_size_mb and max_pages is None and custom_pages is None:
            if file_size_mb > 100:   # Very large files
                max_pages = 3
            elif file_size_mb > 50:  # Large files
                max_pages = 5
            elif file_size_mb > 20:  # Medium files
                max_pages = 10
            else:                    # Small files
                max_pages = None     # Process all pages

        # Start processing timer
        start_time = time.time()

        # Read and process the file
        if file_type == "pdf":
            result = self._process_pdf(file_path, use_vision, max_pages, custom_pages)
        else:
            result = self._process_image(file_path, use_vision)

        # Add processing time information
        result['processing_time'] = time.time() - start_time

        # Add a default confidence score if not present
        if 'confidence_score' not in result:
            result['confidence_score'] = 0.85  # Default confidence

        return result

    def _process_pdf(self, file_path, use_vision=True, max_pages=None, custom_pages=None):
        """Process a PDF file with OCR.

        Args:
            file_path: Path to the PDF file
            use_vision: Whether to use vision model
            max_pages: Optional limit on the number of pages to process
            custom_pages: Optional list of specific 1-based page numbers to process

        Returns:
            Dictionary with structured OCR results (an error payload on failure).
        """
        logger = logging.getLogger("pdf_processor")
        logger.info(f"Processing PDF: {file_path}")

        try:
            # Upload the PDF file
            logger.info("Uploading PDF file to Mistral API")
            uploaded_file = self.client.files.upload(
                file={
                    "file_name": file_path.stem,
                    "content": file_path.read_bytes(),
                },
                purpose="ocr",
            )

            # Get a short-lived signed URL for the uploaded file
            signed_url = self.client.files.get_signed_url(file_id=uploaded_file.id, expiry=1)

            # Process the PDF with OCR
            logger.info(f"Processing PDF with OCR using {OCR_MODEL}")
            pdf_response = self.client.ocr.process(
                document=DocumentURLChunk(document_url=signed_url.url),
                model=OCR_MODEL,
                include_image_base64=True
            )

            # Limit pages if requested
            pages_to_process = pdf_response.pages
            total_pages = len(pdf_response.pages)
            limited_pages = False

            logger.info(f"PDF has {total_pages} total pages")

            # Handle custom page selection if provided
            if custom_pages:
                # Convert to 0-based indexing and keep only in-range page numbers
                valid_indices = [i - 1 for i in custom_pages if 0 < i <= total_pages]
                if valid_indices:
                    pages_to_process = [pdf_response.pages[i] for i in valid_indices]
                    limited_pages = True
                    logger.info(f"Processing {len(valid_indices)} custom-selected pages")
            # Otherwise handle max_pages limit
            elif max_pages and total_pages > max_pages:
                pages_to_process = pages_to_process[:max_pages]
                limited_pages = True
                logger.info(f"Processing only first {max_pages} pages out of {total_pages} total pages")

            # Average per-page confidence when the OCR API provides one.
            confidence_score = 0.0
            try:
                confidence_values = []
                for page in pages_to_process:
                    if hasattr(page, 'confidence'):
                        confidence_values.append(page.confidence)

                if confidence_values:
                    confidence_score = sum(confidence_values) / len(confidence_values)
                else:
                    confidence_score = 0.85  # Default if no confidence scores available
            except Exception:  # was a bare except; never swallow SystemExit/KeyboardInterrupt
                confidence_score = 0.85  # Default fallback

            # Combine pages' markdown into a single string
            all_markdown = "\n\n".join([page.markdown for page in pages_to_process])

            # Extract structured data using the appropriate model
            if use_vision:
                # Get base64 of first page for vision model
                first_page_image = None
                if pages_to_process and pages_to_process[0].images:
                    first_page_image = pages_to_process[0].images[0].image_base64

                if first_page_image:
                    # Use vision model
                    logger.info(f"Using vision model: {VISION_MODEL}")
                    result = self._extract_structured_data_with_vision(first_page_image, all_markdown, file_path.name)
                else:
                    # Fall back to text-only model if no image available
                    logger.info(f"No images in PDF, falling back to text model: {TEXT_MODEL}")
                    result = self._extract_structured_data_text_only(all_markdown, file_path.name)
            else:
                # Use text-only model
                logger.info(f"Using text-only model: {TEXT_MODEL}")
                result = self._extract_structured_data_text_only(all_markdown, file_path.name)

            # Add page limit info to result if needed
            if limited_pages:
                result['limited_pages'] = {
                    'processed': len(pages_to_process),
                    'total': total_pages
                }

            # Add confidence score
            result['confidence_score'] = confidence_score

            # Store the raw OCR response for image rendering.
            # NOTE: this object is not JSON-serializable; strip it before dumping.
            result['raw_response'] = pdf_response

            logger.info(f"PDF processing completed successfully")
            return result

        except Exception as e:
            logger.error(f"Error processing PDF: {str(e)}")
            # Return basic result on error
            return {
                "file_name": file_path.name,
                "topics": ["Document"],
                "languages": ["English"],
                "confidence_score": 0.0,
                "error": str(e),
                "ocr_contents": {
                    "error": f"Failed to process PDF: {str(e)}",
                    "partial_text": "Document could not be fully processed."
                }
            }

    def _process_image(self, file_path, use_vision=True):
        """Process an image file with OCR.

        Args:
            file_path: Path to the image file
            use_vision: Whether to use the vision model for structuring

        Returns:
            Dictionary with structured OCR results (an error payload on failure).
        """
        logger = logging.getLogger("image_processor")
        logger.info(f"Processing image: {file_path}")

        try:
            # Read and encode the image file as a data URL
            logger.info("Encoding image for API")
            encoded_image = base64.b64encode(file_path.read_bytes()).decode()
            # assumes JPEG content — TODO confirm non-JPEG inputs are accepted by the API
            base64_data_url = f"data:image/jpeg;base64,{encoded_image}"

            # Process the image with OCR
            logger.info(f"Processing image with OCR using {OCR_MODEL}")
            image_response = self.client.ocr.process(
                document=ImageURLChunk(image_url=base64_data_url),
                model=OCR_MODEL,
                include_image_base64=True
            )

            # Get the OCR markdown from the first page
            image_ocr_markdown = image_response.pages[0].markdown if image_response.pages else ""

            # Use the API-reported confidence when available; best effort only.
            confidence_score = 0.85  # Default value
            try:
                if hasattr(image_response.pages[0], 'confidence'):
                    confidence_score = image_response.pages[0].confidence
            except Exception:  # was a bare except; e.g. IndexError when there are no pages
                pass

            # Extract structured data using the appropriate model
            if use_vision:
                logger.info(f"Using vision model: {VISION_MODEL}")
                result = self._extract_structured_data_with_vision(base64_data_url, image_ocr_markdown, file_path.name)
            else:
                logger.info(f"Using text-only model: {TEXT_MODEL}")
                result = self._extract_structured_data_text_only(image_ocr_markdown, file_path.name)

            # Add confidence score
            result['confidence_score'] = confidence_score

            # Store the raw OCR response for image rendering.
            # NOTE: this object is not JSON-serializable; strip it before dumping.
            result['raw_response'] = image_response

            logger.info("Image processing completed successfully")
            return result

        except Exception as e:
            logger.error(f"Error processing image: {str(e)}")
            # Return basic result on error
            return {
                "file_name": file_path.name,
                "topics": ["Document"],
                "languages": ["English"],
                "confidence_score": 0.0,
                "error": str(e),
                "ocr_contents": {
                    "error": f"Failed to process image: {str(e)}",
                    "partial_text": "Image could not be processed."
                }
            }

    def _extract_structured_data_with_vision(self, image_base64, ocr_markdown, filename):
        """Extract structured data using the vision model.

        Falls back to the text-only model if the vision call fails.
        """
        try:
            # Parse with vision model
            chat_response = self.client.chat.parse(
                model=VISION_MODEL,
                messages=[
                    {
                        "role": "user",
                        "content": [
                            ImageURLChunk(image_url=image_base64),
                            TextChunk(text=(
                                f"This is a historical document's OCR in markdown:\n"
                                f"<BEGIN_IMAGE_OCR>\n{ocr_markdown}\n<END_IMAGE_OCR>.\n"
                                f"Convert this into a structured JSON response with the OCR contents in a sensible dictionary. "
                                f"Extract topics, languages, and organize the content logically."
                            ))
                        ],
                    },
                ],
                response_format=StructuredOCRModel,
                temperature=0
            )

            # Round-trip through JSON so enum members become plain values.
            result = json.loads(chat_response.choices[0].message.parsed.json())

            # Ensure languages is a list of strings (no-op after the JSON round-trip,
            # kept as a safeguard against enum objects leaking through).
            if 'languages' in result:
                result['languages'] = [str(lang) for lang in result.get('languages', [])]

        except Exception as e:
            # Fall back to text-only model if vision model fails
            # (was print(); route through logging like the rest of the module)
            logging.warning(f"Vision model failed: {str(e)}. Falling back to text-only model.")
            result = self._extract_structured_data_text_only(ocr_markdown, filename)

        return result

    def _extract_structured_data_text_only(self, ocr_markdown, filename):
        """Extract structured data using the text-only model.

        Returns a minimal result containing the raw OCR text if parsing fails.
        """
        try:
            # Parse with text-only model
            chat_response = self.client.chat.parse(
                model=TEXT_MODEL,
                messages=[
                    {
                        "role": "user",
                        "content": f"This is a historical document's OCR in markdown:\n"
                                   f"<BEGIN_IMAGE_OCR>\n{ocr_markdown}\n<END_IMAGE_OCR>.\n"
                                   f"Convert this into a structured JSON response with the OCR contents. "
                                   f"Extract topics, languages, and organize the content logically."
                    },
                ],
                response_format=StructuredOCRModel,
                temperature=0
            )

            # Round-trip through JSON so enum members become plain values.
            result = json.loads(chat_response.choices[0].message.parsed.json())

            # Ensure languages is a list of strings (no-op after the JSON round-trip,
            # kept as a safeguard against enum objects leaking through).
            if 'languages' in result:
                result['languages'] = [str(lang) for lang in result.get('languages', [])]

        except Exception as e:
            # Create a basic result if parsing fails
            # (was print(); route through logging like the rest of the module)
            logging.warning(f"Text model failed: {str(e)}. Creating basic result.")
            result = {
                "file_name": filename,
                "topics": ["Document"],
                "languages": ["English"],
                "ocr_contents": {
                    "raw_text": ocr_markdown
                }
            }

        return result
401
+
402
# For testing directly: python structured_ocr.py <file_path>
if __name__ == "__main__":
    import sys

    if len(sys.argv) < 2:
        print("Usage: python structured_ocr.py <file_path>")
        sys.exit(1)

    file_path = sys.argv[1]
    processor = StructuredOCR()
    result = processor.process_file(file_path)

    # BUG FIX: successful results carry 'raw_response' (an SDK object that is
    # not JSON-serializable), which made json.dumps raise TypeError. Drop it
    # before printing.
    result.pop('raw_response', None)
    print(json.dumps(result, indent=2))
config.py CHANGED
@@ -4,64 +4,14 @@ Configuration file for Mistral OCR processing.
4
  Contains API key and other settings.
5
  """
6
  import os
7
- import logging
8
- from dotenv import load_dotenv
9
 
10
- # Configure logging
11
- logger = logging.getLogger("config")
 
 
 
12
 
13
- # Load environment variables from .env file if it exists
14
- load_dotenv()
15
-
16
- # Mistral API key handling - prioritizing Hugging Face environment
17
- # Priority order:
18
- # 1. HF_API_KEY environment variable (Hugging Face standard)
19
- # 2. HUGGING_FACE_API_KEY environment variable (alternative name)
20
- # 3. HF_MISTRAL_API_KEY environment variable (for Hugging Face deployment)
21
- # 4. MISTRAL_API_KEY environment variable (fallback)
22
- # 5. Empty string (will show warning in app)
23
-
24
- MISTRAL_API_KEY = os.environ.get("HF_API_KEY",
25
- os.environ.get("HUGGING_FACE_API_KEY",
26
- os.environ.get("HF_MISTRAL_API_KEY",
27
- os.environ.get("MISTRAL_API_KEY", "")))).strip()
28
-
29
- if not MISTRAL_API_KEY:
30
- logger.warning("No Mistral API key found in environment variables. API functionality will be limited.")
31
-
32
- # Check if we're in test mode (allows operation without valid API key)
33
- # Set to False to use actual API calls with Mistral API
34
- TEST_MODE = False
35
-
36
- # Model settings with fallbacks
37
- OCR_MODEL = os.environ.get("MISTRAL_OCR_MODEL", "mistral-ocr-latest")
38
- TEXT_MODEL = os.environ.get("MISTRAL_TEXT_MODEL", "mistral-small-latest") # Updated from ministral-8b-latest
39
- VISION_MODEL = os.environ.get("MISTRAL_VISION_MODEL", "mistral-small-latest") # faster model that supports vision
40
-
41
- # Image preprocessing settings optimized for historical documents
42
- # These can be customized from environment variables
43
- IMAGE_PREPROCESSING = {
44
- "enhance_contrast": float(os.environ.get("ENHANCE_CONTRAST", "3.5")), # Increased contrast for better text recognition
45
- "sharpen": os.environ.get("SHARPEN", "True").lower() in ("true", "1", "yes"),
46
- "denoise": os.environ.get("DENOISE", "True").lower() in ("true", "1", "yes"),
47
- "max_size_mb": float(os.environ.get("MAX_IMAGE_SIZE_MB", "200.0")), # Increased size limit for better quality
48
- "target_dpi": int(os.environ.get("TARGET_DPI", "300")), # Target DPI for scaling
49
- "compression_quality": int(os.environ.get("COMPRESSION_QUALITY", "100")), # Higher quality for better OCR results
50
- # # Enhanced settings for handwritten documents
51
- "handwritten": {
52
- "block_size": int(os.environ.get("HANDWRITTEN_BLOCK_SIZE", "21")), # Larger block size for adaptive thresholding
53
- "constant": int(os.environ.get("HANDWRITTEN_CONSTANT", "5")), # Lower constant for adaptive thresholding
54
- "use_dilation": os.environ.get("HANDWRITTEN_DILATION", "True").lower() in ("true", "1", "yes"), # Connect broken strokes
55
- "dilation_iterations": int(os.environ.get("HANDWRITTEN_DILATION_ITERATIONS", "2")), # More iterations for better stroke connection
56
- "dilation_kernel_size": int(os.environ.get("HANDWRITTEN_DILATION_KERNEL_SIZE", "3")) # Larger kernel for dilation
57
- }
58
- }
59
-
60
- # OCR settings optimized for single-page performance
61
- OCR_SETTINGS = {
62
- "timeout_ms": int(os.environ.get("OCR_TIMEOUT_MS", "45000")), # Shorter timeout for single pages (45 seconds)
63
- "max_retries": int(os.environ.get("OCR_MAX_RETRIES", "2")), # Fewer retries to avoid rate-limiting
64
- "retry_delay": int(os.environ.get("OCR_RETRY_DELAY", "1")), # Shorter initial retry delay for faster execution
65
- "include_image_base64": os.environ.get("INCLUDE_IMAGE_BASE64", "True").lower() in ("true", "1", "yes"),
66
- "thread_count": int(os.environ.get("OCR_THREAD_COUNT", "2")) # Lower thread count to prevent API rate limiting
67
- }
 
4
  Contains API key and other settings.
5
  """
6
  import os
 
 
7
 
8
# Your Mistral API key - read from Hugging Face secrets or environment variables.
# Priority order: HF_MISTRAL_API_KEY > MISTRAL_API_KEY > empty string.
# Note: No default API key is provided for security reasons.
# .strip() guards against stray whitespace/newlines pasted into a secret,
# which would otherwise silently break authentication.
MISTRAL_API_KEY = os.environ.get("HF_MISTRAL_API_KEY",            # First check HF-specific env var
                                 os.environ.get("MISTRAL_API_KEY", "")).strip()  # Then regular env var

# Model settings
OCR_MODEL = "mistral-ocr-latest"     # dedicated OCR model
TEXT_MODEL = "ministral-8b-latest"   # text-only structuring model
VISION_MODEL = "pixtral-12b-latest"  # vision-capable structuring model
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
constants.py DELETED
@@ -1,193 +0,0 @@
1
- """
2
- Constants for the Historical OCR application.
3
-
4
- This module contains all the constants used throughout the application,
5
- making it easier to maintain and update values in one place.
6
- """
7
-
8
- # API limits
9
- MAX_FILE_SIZE_MB = 200
10
- MAX_PAGES = 20
11
-
12
- # Caching
13
- CACHE_TTL_SECONDS = 24 * 3600 # 24 hours
14
- MAX_CACHE_ENTRIES = 20
15
-
16
- # Image processing
17
- MAX_IMAGE_DIMENSION = 2500
18
- IMAGE_QUALITY = 100
19
-
20
- # Document types
21
- DOCUMENT_TYPES = [
22
- "Auto-detect (standard processing)",
23
- "Newspaper or Magazine",
24
- "Letter or Correspondence",
25
- "Book or Publication",
26
- "Form or Legal Document",
27
- "Recipe",
28
- "Handwritten Document",
29
- "Map or Illustration",
30
- "Table or Spreadsheet",
31
- "Other (specify in instructions)"
32
- ]
33
-
34
- # Document layouts
35
- DOCUMENT_LAYOUTS = [
36
- "Standard layout",
37
- "Multiple columns",
38
- "Table/grid format",
39
- "Mixed layout with images"
40
- ]
41
-
42
- # Preprocessing document types
43
- PREPROCESSING_DOC_TYPES = ["standard", "handwritten", "typed", "printed"]
44
-
45
- # Rotation options
46
- ROTATION_OPTIONS = [0, 90, 180, 270]
47
-
48
- # PDF settings
49
- DEFAULT_PDF_DPI = 100
50
- MIN_PDF_DPI = 72
51
- MAX_PDF_DPI = 300
52
- DEFAULT_MAX_PAGES = 3
53
-
54
- # Performance modes
55
- PERFORMANCE_MODES = ["Quality", "Speed"]
56
-
57
- # Custom prompt templates
58
- CUSTOM_PROMPT_TEMPLATES = {
59
- "Newspaper or Magazine": "This is a newspaper/magazine. Process columns from top to bottom, capture headlines, bylines, article text and captions.",
60
- "Letter or Correspondence": "This is a letter/correspondence. Capture letterhead, date, greeting, body, closing and signature. Note any handwritten annotations.",
61
- "Book or Publication": "This is a book/publication. Extract titles, headers, footnotes, page numbers and body text. Preserve paragraph structure and any special formatting.",
62
- "Form or Legal Document": "This is a form/legal document. Extract all field labels and values, preserving the structure. Pay special attention to signature lines, dates, and any official markings.",
63
- "Recipe": "This is a recipe. Extract title, ingredients list with measurements, and preparation instructions. Maintain the distinction between ingredients and preparation steps.",
64
- "Handwritten Document": "This is a handwritten document. Carefully transcribe all handwritten text, preserving line breaks. Note any unclear sections or annotations.",
65
- "Map or Illustration": "This is a map or illustration. Transcribe all labels, legends, captions, and annotations. Note any scale indicators or directional markings.",
66
- "Table or Spreadsheet": "This is a table/spreadsheet. Preserve row and column structure, maintaining alignment of data. Extract headers and all cell values.",
67
- "Other (specify in instructions)": "Please describe the document type and any special processing requirements here."
68
- }
69
-
70
- # Layout prompt additions
71
- LAYOUT_PROMPT_ADDITIONS = {
72
- "Multiple columns": "Document has multiple columns. Read each column from top to bottom, then move to the next column.",
73
- "Table/grid format": "Document contains table data. Preserve row and column structure during extraction.",
74
- "Mixed layout with images": "Document has mixed text layout with images. Extract text in proper reading order."
75
- }
76
-
77
- # Content themes for subject tag extraction
78
- CONTENT_THEMES = {
79
- # Historical Periods
80
- "Prehistoric": ["paleolithic", "neolithic", "stone age", "bronze age", "iron age", "prehistoric", "ancient", "archaeology", "artifact", "primitive"],
81
- "Ancient World": ["mesopotamia", "egypt", "greek", "roman", "persia", "babylonian", "assyrian", "pharaoh", "hieroglyphics", "cuneiform", "classical", "antiquity", "hellenistic", "republic", "empire"],
82
- "Medieval": ["middle ages", "medieval", "feudal", "crusades", "byzantine", "carolingian", "holy roman empire", "dark ages", "castle", "knights", "chivalry", "monastery", "plague", "viking", "norse"],
83
- "Renaissance": ["renaissance", "humanism", "reformation", "counter-reformation", "medici", "tudor", "elizabethan", "shakespeare", "machiavelli", "gutenberg", "printing press"],
84
- "Early Modern": ["early modern", "enlightenment", "age of reason", "scientific revolution", "colonial", "colonization", "imperialism", "revolution", "baroque", "bourbon", "habsburg", "stuart"],
85
- "18th Century": ["18th century", "1700s", "revolution", "american revolution", "french revolution", "enlightenment", "rococo", "neoclassical", "voltaire", "rousseau", "industrial"],
86
- "19th Century": ["19th century", "1800s", "victorian", "romantic", "napoleonic", "civil war", "industrial revolution", "manifest destiny", "colonial", "imperialism", "belle epoque", "fin de siecle"],
87
- "20th Century": ["20th century", "1900s", "world war", "great depression", "cold war", "interwar", "postwar", "modernism", "atomic", "post-colonial", "totalitarian", "fascism", "soviet", "civil rights"],
88
- "Contemporary": ["contemporary", "modern", "postmodern", "digital age", "globalization", "information age", "post-industrial", "post-colonial", "post-soviet", "post-war", "21st century"],
89
-
90
- # Geographic Contexts
91
- "European History": ["europe", "western europe", "eastern europe", "central europe", "mediterranean", "nordic", "iberian", "british", "habsburg", "bourbon", "prussia", "holy roman empire"],
92
- "Asian History": ["asia", "east asia", "south asia", "central asia", "southeast asia", "china", "japan", "india", "persia", "ottoman", "mongolian", "dynasty", "shogunate", "mughal", "silk road"],
93
- "African History": ["africa", "north africa", "west africa", "east africa", "sub-saharan", "sahel", "swahili", "maghreb", "nubian", "ethiopian", "zulu", "colonial africa", "apartheid"],
94
- "American History": ["america", "colonial america", "revolutionary", "antebellum", "civil war", "reconstruction", "frontier", "westward expansion", "manifest destiny", "native american", "indigenous"],
95
- "Latin American": ["latin america", "mesoamerica", "caribbean", "aztec", "mayan", "inca", "colonial", "viceroyalty", "independence", "revolution", "hispanic", "creole", "mestizo", "indigenous"],
96
- "Oceanic History": ["oceania", "pacific", "australian", "aboriginal", "indigenous", "polynesian", "melanesian", "micronesian", "maori", "maritime", "exploration", "settlement", "colonial"],
97
-
98
- # Historical Methodologies & Approaches
99
- "Archival Research": ["archive", "manuscript", "primary source", "provenance", "document", "preservation", "cataloging", "repository", "collection", "papers", "fonds", "records", "registry"],
100
- "Oral History": ["oral history", "testimony", "interview", "narrative", "memory", "ethnography", "storytelling", "tradition", "folklore", "witness", "account", "recording", "indigenous knowledge"],
101
- "Historical Archaeology": ["archaeology", "excavation", "artifact", "material culture", "stratigraphy", "conservation", "field work", "site", "ruins", "preservation", "heritage", "restoration"],
102
- "Digital History": ["digital", "database", "digitization", "computational", "network analysis", "gis", "mapping", "visualization", "data mining", "text analysis", "digital humanities", "encoding"],
103
- "Historiography": ["historiography", "revisionism", "interpretation", "narrative", "discourse", "bias", "perspective", "theory", "methodology", "framework", "historical thinking", "meta-history"],
104
-
105
- # Historical Document Types
106
- "Administrative Records": ["record", "registry", "account", "ledger", "census", "tax roll", "inventory", "charter", "deed", "grant", "patent", "minutes", "docket", "survey", "assessment", "register"],
107
- "Diplomatic Documents": ["treaty", "agreement", "proclamation", "declaration", "diplomatic", "embassy", "consul", "dispatch", "communique", "protocol", "convention", "alliance", "international"],
108
- "Personal Papers": ["diary", "journal", "memoir", "autobiography", "correspondence", "letter", "personal", "private", "papers", "notes", "scrapbook", "commonplace book", "sketchbook"],
109
- "Media History": ["newspaper", "gazette", "periodical", "pamphlet", "broadside", "print culture", "press", "editorial", "journalism", "reporter", "editor", "circulation", "readership", "subscriber"],
110
- "Visual Materials": ["photograph", "illustration", "print", "map", "atlas", "cartography", "engraving", "woodcut", "lithograph", "panorama", "portrait", "landscape", "sketch", "drawing", "plate"],
111
- "Legal Documents": ["legal", "law", "statute", "code", "constitution", "legislation", "decree", "ordinance", "bylaw", "regulation", "case", "trial", "testimony", "deposition", "verdict", "judgment"],
112
-
113
- # Historical Themes & Movements
114
- "Economic History": ["economic", "commerce", "trade", "market", "merchant", "finance", "banking", "currency", "coin", "inflation", "recession", "depression", "exchange", "capital", "labor", "guild"],
115
- "Social History": ["social", "society", "class", "status", "hierarchy", "everyday life", "community", "neighborhood", "urban", "rural", "poverty", "wealth", "leisure", "entertainment", "customs"],
116
- "Political History": ["political", "politics", "government", "state", "monarchy", "republic", "democracy", "aristocracy", "parliament", "congress", "election", "regime", "policy", "reform", "revolution"],
117
- "Intellectual History": ["intellectual", "idea", "philosophy", "theory", "concept", "movement", "thought", "discourse", "debate", "enlightenment", "rationalism", "empiricism", "ideology"],
118
- "Cultural History": ["cultural", "culture", "custom", "tradition", "ritual", "ceremony", "festival", "celebration", "holiday", "folklore", "music", "art", "literature", "fashion", "consumption"],
119
- "Religious History": ["religious", "religion", "church", "theology", "belief", "faith", "worship", "ritual", "sacred", "clergy", "monastery", "temple", "mosque", "synagogue", "pilgrimage", "sect"],
120
- "Military History": ["military", "war", "conflict", "battle", "campaign", "siege", "army", "navy", "soldier", "officer", "regiment", "battalion", "artillery", "cavalry", "infantry", "strategy", "tactics"],
121
- "Science History": ["scientific", "science", "experiment", "discovery", "theory", "hypothesis", "observation", "laboratory", "academy", "research", "natural philosophy", "medicine", "technology"],
122
- "Environmental History": ["environmental", "ecology", "climate", "weather", "landscape", "agriculture", "farming", "forestry", "conservation", "pollution", "resource", "sustainability", "natural"],
123
-
124
- # Specialized Historical Topics
125
- "Migration History": ["migration", "immigration", "emigration", "diaspora", "exile", "refugee", "settlement", "colonization", "population movement", "forced migration", "displacement", "resettlement"],
126
- "Maritime History": ["maritime", "naval", "shipping", "navigation", "sailor", "piracy", "privateering", "admiralty", "port", "harbor", "shipyard", "vessel", "sail", "trade route", "exploration"],
127
- "Gender History": ["gender", "women", "feminist", "sexuality", "masculinity", "femininity", "patriarchy", "suffrage", "domestic", "family", "marriage", "emancipation", "rights", "equality"],
128
- "Labor History": ["labor", "worker", "union", "strike", "apprentice", "guild", "factory", "workshop", "wage", "hours", "working conditions", "industrialization", "mechanization", "automation"],
129
- "Urban History": ["urban", "city", "town", "metropolitan", "municipal", "civic", "suburb", "neighborhood", "planning", "infrastructure", "utilities", "housing", "development", "gentrification"],
130
- "Rural History": ["rural", "countryside", "village", "agricultural", "farming", "peasant", "yeoman", "tenant", "sharecropper", "enclosure", "common land", "manor", "estate", "plantation"],
131
- "Colonial History": ["colonial", "colony", "settlement", "frontier", "borderland", "territory", "dominion", "province", "governance", "administration", "native", "indigenous", "contact zone"],
132
- "Indigenous History": ["indigenous", "native", "aboriginal", "first nations", "tribal", "reservation", "sovereignty", "land rights", "treaty rights", "cultural preservation", "oral tradition"],
133
-
134
- # General Historical Terms
135
- "Historical": ["history", "historical", "historiography", "heritage", "legacy", "tradition", "memory", "commemoration", "preservation", "conservation", "restoration", "interpretation", "significance"],
136
- "Chronology": ["chronology", "timeline", "periodization", "era", "epoch", "age", "century", "decade", "millennium", "year", "date", "dating", "chronological", "contemporary", "synchronic", "diachronic"],
137
- "Heritage": ["heritage", "preservation", "conservation", "landmark", "monument", "historic site", "museum", "archive", "collection", "artifact", "relic", "antiquity", "cultural heritage", "patrimony"]
138
- }
139
-
140
- # Period tags based on year ranges
141
- # These ranges are used to assign historical period tags to documents based on their year.
142
- PERIOD_TAGS = {
143
- (0, 499): "Ancient Era (to 500 CE)",
144
- (500, 999): "Early Medieval (500–1000)",
145
- (1000, 1299): "High Medieval (1000–1300)",
146
- (1300, 1499): "Late Medieval (1300–1500)",
147
- (1500, 1599): "Renaissance (1500–1600)",
148
- (1600, 1699): "Early Modern (1600–1700)",
149
- (1700, 1775): "Enlightenment (1700–1775)",
150
- (1776, 1799): "Age of Revolutions (1776–1800)",
151
- (1800, 1849): "Early 19th Century (1800–1850)",
152
- (1850, 1899): "Late 19th Century (1850–1900)",
153
- (1900, 1918): "Early 20th Century & WWI (1900–1918)",
154
- (1919, 1938): "Interwar Period (1919–1938)",
155
- (1939, 1945): "World War II (1939–1945)",
156
- (1946, 1968): "Postwar & Mid-20th Century (1946–1968)",
157
- (1969, 1989): "Late 20th Century (1969–1989)",
158
- (1990, 2000): "Turn of the 21st Century (1990–2000)",
159
- (2001, 2099): "Contemporary (21st Century)"
160
- }
161
-
162
- # Default fallback tags for documents when no specific tags are detected.
163
- DEFAULT_TAGS = [
164
- "Document",
165
- "Historical",
166
- "Text",
167
- "Primary Source",
168
- "Archival Material",
169
- "Record",
170
- "Manuscript",
171
- "Printed Material",
172
- "Correspondence",
173
- "Publication"
174
- ]
175
-
176
- # Generic tags that can be used for broad categorization or as supplemental tags.
177
- GENERIC_TAGS = [
178
- "Archive",
179
- "Content",
180
- "Record",
181
- "Source",
182
- "Material",
183
- "Page",
184
- "Scan",
185
- "Image",
186
- "Transcription",
187
- "Uncategorized",
188
- "General",
189
- "Miscellaneous"
190
- ]
191
-
192
- # UI constants
193
- PROGRESS_DELAY = 0.8 # Seconds to show completion message
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
error_handler.py DELETED
@@ -1,65 +0,0 @@
1
- import logging
2
- import streamlit as st
3
- import time
4
- from constants import MAX_FILE_SIZE_MB
5
-
6
- # Configure logging
7
- logger = logging.getLogger("error_handler")
8
- logger.setLevel(logging.INFO)
9
-
10
- def handle_ocr_error(exception, progress_reporter=None):
11
- """
12
- Handle OCR processing errors and provide user-friendly messages
13
-
14
- Args:
15
- exception: The exception that occurred
16
- progress_reporter: ProgressReporter instance for UI updates
17
-
18
- Returns:
19
- str: User-friendly error message
20
- """
21
- error_message = str(exception)
22
-
23
- # Complete progress reporting if provided
24
- if progress_reporter:
25
- progress_reporter.complete(success=False)
26
-
27
- # Check for specific error types and provide helpful user-facing messages
28
- if "rate limit" in error_message.lower() or "429" in error_message or "requests rate limit exceeded" in error_message.lower():
29
- friendly_message = "The AI service is currently experiencing high demand. Please try again in a few minutes."
30
- logger.error(f"Rate limit error: {error_message}")
31
- return friendly_message
32
- elif "quota" in error_message.lower() or "credit" in error_message.lower() or "subscription" in error_message.lower():
33
- friendly_message = "The API usage quota has been reached. Please check your API key and subscription limits."
34
- logger.error(f"API quota error: {error_message}")
35
- return friendly_message
36
- elif "timeout" in error_message.lower() or "timed out" in error_message.lower():
37
- friendly_message = "The request timed out. This may be due to a large document or high server load. Please try again or use a smaller document."
38
- logger.error(f"Timeout error: {error_message}")
39
- return friendly_message
40
- elif "file size" in error_message.lower() or "too large" in error_message.lower():
41
- friendly_message = f"The file is too large. Maximum file size is {MAX_FILE_SIZE_MB}MB."
42
- logger.error(f"File size error: {error_message}")
43
- return friendly_message
44
- else:
45
- # Generic error message for other errors
46
- logger.error(f"OCR processing error: {error_message}", exc_info=True)
47
- return f"An error occurred during processing: {error_message}"
48
-
49
- def check_file_size(file_bytes):
50
- """
51
- Check if file size is within limits
52
-
53
- Args:
54
- file_bytes: File content as bytes
55
-
56
- Returns:
57
- tuple: (is_valid, file_size_mb, error_message)
58
- """
59
- file_size_mb = len(file_bytes) / (1024 * 1024)
60
-
61
- if file_size_mb > MAX_FILE_SIZE_MB:
62
- error_message = f"File size {file_size_mb:.2f} MB exceeds limit of {MAX_FILE_SIZE_MB} MB"
63
- return False, file_size_mb, error_message
64
-
65
- return True, file_size_mb, None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
image_segmentation.py DELETED
@@ -1,253 +0,0 @@
1
- """
2
- Image segmentation utility for OCR preprocessing.
3
- Separates text regions from image regions to improve OCR accuracy on mixed-content documents.
4
- Uses content-aware adaptive segmentation for improved results across document types.
5
- """
6
-
7
- import cv2
8
- import numpy as np
9
- from PIL import Image
10
- import io
11
- import base64
12
- import logging
13
- from pathlib import Path
14
- from typing import Tuple, List, Dict, Union, Optional
15
-
16
- # Configure logging
17
- logging.basicConfig(level=logging.INFO,
18
- format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
19
- logger = logging.getLogger(__name__)
20
-
21
- def segment_image_for_ocr(image_path: Union[str, Path], vision_enabled: bool = True, preserve_content: bool = True) -> Dict[str, Union[Image.Image, str]]:
22
- """
23
- Prepare image for OCR processing using content-aware segmentation.
24
- Uses adaptive region detection based on text density analysis.
25
-
26
- Args:
27
- image_path: Path to the image file
28
- vision_enabled: Whether the vision model is enabled
29
- preserve_content: Whether to preserve original content without enhancement
30
-
31
- Returns:
32
- Dict containing segmentation results
33
- """
34
- # Convert to Path object if string
35
- image_file = Path(image_path) if isinstance(image_path, str) else image_path
36
-
37
- # Log start of processing
38
- logger.info(f"Preparing image for Mistral OCR: {image_file.name}")
39
-
40
- try:
41
- # Open original image with PIL
42
- with Image.open(image_file) as pil_img:
43
- # Check for low entropy images when vision is disabled
44
- if not vision_enabled:
45
- from utils.image_utils import calculate_image_entropy
46
- ent = calculate_image_entropy(pil_img)
47
- if ent < 3.5: # Likely line-art or blank page
48
- logger.info(f"Low entropy image detected ({ent:.2f}), classifying as illustration")
49
- return {
50
- 'text_regions': None,
51
- 'image_regions': pil_img,
52
- 'text_mask_base64': None,
53
- 'combined_result': None,
54
- 'text_regions_coordinates': []
55
- }
56
-
57
- # Convert to RGB if needed
58
- if pil_img.mode != 'RGB':
59
- pil_img = pil_img.convert('RGB')
60
-
61
- # Get image dimensions
62
- img_np = np.array(pil_img)
63
- img_width, img_height = pil_img.size
64
-
65
- # Analyze text density to determine if advanced segmentation is needed
66
- # This replaces document-specific logic with content-aware analysis
67
- from utils.image_utils import estimate_text_density
68
- text_density = estimate_text_density(img_np)
69
-
70
- # Use adaptive approach for documents with unusual text distribution
71
- if text_density['pattern'] == 'varied' or text_density['uppercase_sections'] > 0:
72
- logger.info(f"Using adaptive segmentation for document with varied text density pattern={text_density['pattern']}, uppercase_sections={text_density['uppercase_sections']}")
73
-
74
- # Detect content regions based on text density
75
- from utils.text_utils import detect_content_regions
76
- regions = detect_content_regions(img_np)
77
-
78
- # Create visualization with green borders around the text regions
79
- vis_img = img_np.copy()
80
-
81
- # Draw regions on visualization
82
- for x, y, w, h in regions:
83
- cv2.rectangle(vis_img, (x, y), (x+w, y+h), (0, 255, 0), 3)
84
-
85
- # Add text to indicate we're using adaptive processing
86
- font = cv2.FONT_HERSHEY_SIMPLEX
87
- cv2.putText(vis_img, "Adaptive region processing", (30, 60), font, 1, (0, 255, 0), 2)
88
-
89
- # Create visualization images
90
- text_regions_vis = Image.fromarray(vis_img)
91
- image_regions_vis = text_regions_vis.copy()
92
-
93
- # Create a mask highlighting the text regions
94
- text_mask = np.zeros((img_height, img_width), dtype=np.uint8)
95
- for x, y, w, h in regions:
96
- text_mask[y:y+h, x:x+w] = 255
97
-
98
- _, buffer = cv2.imencode('.png', text_mask)
99
- text_mask_base64 = base64.b64encode(buffer).decode('utf-8')
100
-
101
- # Extract region images
102
- region_images = []
103
- for i, (x, y, w, h) in enumerate(regions):
104
- region = img_np[y:y+h, x:x+w].copy()
105
- region_pil = Image.fromarray(region)
106
-
107
- region_info = {
108
- 'image': region,
109
- 'pil_image': region_pil,
110
- 'coordinates': (x, y, w, h),
111
- 'padded_coordinates': (x, y, w, h),
112
- 'order': i
113
- }
114
- region_images.append(region_info)
115
-
116
- # Return the adaptive segmentation results
117
- return {
118
- 'text_regions': text_regions_vis,
119
- 'image_regions': image_regions_vis,
120
- 'text_mask_base64': f"data:image/png;base64,{text_mask_base64}",
121
- 'combined_result': pil_img,
122
- 'text_regions_coordinates': regions,
123
- 'region_images': region_images,
124
- 'segmentation_type': 'adaptive'
125
- }
126
- else:
127
- # SIMPLIFIED APPROACH for most documents
128
- # Let Mistral OCR handle the entire document understanding process
129
- logger.info(f"Using standard approach for document with uniform text density")
130
-
131
- # For visualization, mark the entire image as a text region
132
- full_image_region = [(0, 0, img_width, img_height)]
133
-
134
- # Create visualization with a simple border
135
- vis_img = img_np.copy()
136
- cv2.rectangle(vis_img, (5, 5), (img_width-5, img_height-5), (0, 255, 0), 5)
137
-
138
- # Add text to indicate this is using Mistral's native processing
139
- font = cv2.FONT_HERSHEY_SIMPLEX
140
- cv2.putText(vis_img, "Processed by Mistral OCR", (30, 60), font, 1, (0, 255, 0), 2)
141
-
142
- # Create visualizations and masks
143
- text_regions_vis = Image.fromarray(vis_img)
144
- image_regions_vis = text_regions_vis.copy()
145
-
146
- # Create a mask of the entire image (just for visualization)
147
- text_mask = np.ones((img_height, img_width), dtype=np.uint8) * 255
148
- _, buffer = cv2.imencode('.png', text_mask)
149
- text_mask_base64 = base64.b64encode(buffer).decode('utf-8')
150
-
151
- # Return the original image as the combined result
152
- return {
153
- 'text_regions': text_regions_vis,
154
- 'image_regions': image_regions_vis,
155
- 'text_mask_base64': f"data:image/png;base64,{text_mask_base64}",
156
- 'combined_result': pil_img,
157
- 'text_regions_coordinates': full_image_region,
158
- 'region_images': [{
159
- 'image': img_np,
160
- 'pil_image': pil_img,
161
- 'coordinates': (0, 0, img_width, img_height),
162
- 'padded_coordinates': (0, 0, img_width, img_height),
163
- 'order': 0
164
- }],
165
- 'segmentation_type': 'simplified'
166
- }
167
-
168
- except Exception as e:
169
- logger.error(f"Error segmenting image {image_file.name}: {str(e)}")
170
- # Return None values if processing fails
171
- return {
172
- 'text_regions': None,
173
- 'image_regions': None,
174
- 'text_mask_base64': None,
175
- 'combined_result': None,
176
- 'text_regions_coordinates': []
177
- }
178
-
179
- def process_segmented_image(image_path: Union[str, Path], output_dir: Optional[Path] = None, preserve_content: bool = True) -> Dict:
180
- """
181
- Process an image using segmentation for improved OCR, saving visualization outputs.
182
-
183
- Args:
184
- image_path: Path to the image file
185
- output_dir: Optional directory to save visualization outputs
186
-
187
- Returns:
188
- Dictionary with processing results and paths to output files
189
- """
190
- # Convert to Path object if string
191
- image_file = Path(image_path) if isinstance(image_path, str) else image_path
192
-
193
- # Create output directory if not provided
194
- if output_dir is None:
195
- output_dir = Path("output") / "segmentation"
196
- output_dir.mkdir(parents=True, exist_ok=True)
197
-
198
- # Process the image with segmentation
199
- segmentation_results = segment_image_for_ocr(image_file)
200
-
201
- # Prepare results dictionary
202
- results = {
203
- 'original_image': str(image_file),
204
- 'output_files': {}
205
- }
206
-
207
- # Save visualization outputs if segmentation was successful
208
- if segmentation_results['text_regions'] is not None:
209
- # Save text regions visualization
210
- text_regions_path = output_dir / f"{image_file.stem}_text_regions.jpg"
211
- segmentation_results['text_regions'].save(text_regions_path)
212
- results['output_files']['text_regions'] = str(text_regions_path)
213
-
214
- # Save image regions visualization
215
- image_regions_path = output_dir / f"{image_file.stem}_image_regions.jpg"
216
- segmentation_results['image_regions'].save(image_regions_path)
217
- results['output_files']['image_regions'] = str(image_regions_path)
218
-
219
- # Save combined result
220
- combined_path = output_dir / f"{image_file.stem}_combined.jpg"
221
- segmentation_results['combined_result'].save(combined_path)
222
- results['output_files']['combined_result'] = str(combined_path)
223
-
224
- # Save text mask visualization
225
- text_mask_path = output_dir / f"{image_file.stem}_text_mask.png"
226
- # Save text mask from base64
227
- if segmentation_results['text_mask_base64']:
228
- base64_data = segmentation_results['text_mask_base64'].split(',')[1]
229
- with open(text_mask_path, 'wb') as f:
230
- f.write(base64.b64decode(base64_data))
231
- results['output_files']['text_mask'] = str(text_mask_path)
232
-
233
- # Add detected text regions count
234
- results['text_regions_count'] = len(segmentation_results['text_regions_coordinates'])
235
- results['text_regions_coordinates'] = segmentation_results['text_regions_coordinates']
236
-
237
- return results
238
-
239
- if __name__ == "__main__":
240
- # Simple test - process a sample image if run directly
241
- import sys
242
-
243
- if len(sys.argv) > 1:
244
- image_path = sys.argv[1]
245
- else:
246
- image_path = "input/handwritten-journal.jpg" # Example image path"
247
-
248
- logger.info(f"Testing image segmentation on {image_path}")
249
- results = process_segmented_image(image_path)
250
-
251
- # Print results summary
252
- logger.info(f"Segmentation complete. Found {results.get('text_regions_count', 0)} text regions.")
253
- logger.info(f"Output files saved to: {[path for path in results.get('output_files', {}).values()]}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
input/The Magician, or Bottle Cungerer.jpeg ADDED

Git LFS Details

  • SHA256: 3becaf6f5548a794436864885bb125f3fa09f1e6f7bdd76e8878f2d36ff26232
  • Pointer size: 132 Bytes
  • Size of remote file: 2.96 MB
input/baldwin-letter-1.jpg ADDED

Git LFS Details

  • SHA256: a30d7d9f224c777a1697507200a87e41be5fd590efbe8271fa41dbd8bd8a158d
  • Pointer size: 131 Bytes
  • Size of remote file: 135 kB
input/baldwin-letter-2.jpg ADDED

Git LFS Details

  • SHA256: 8b605a6eabd466da265e9e1aa3576160c4dbee06643ece5a18cdb1e45f3f683a
  • Pointer size: 131 Bytes
  • Size of remote file: 114 kB
input/flier.png ADDED
input/harpers.pdf DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:3c9030714b07bb5f7c9adf8b175975baa9b4f40402da62d69cad9b0d4ba61b94
3
- size 14931299
 
 
 
 
input/letter-1.jpg ADDED

Git LFS Details

  • SHA256: a30d7d9f224c777a1697507200a87e41be5fd590efbe8271fa41dbd8bd8a158d
  • Pointer size: 131 Bytes
  • Size of remote file: 135 kB
input/letter-2.jpg ADDED

Git LFS Details

  • SHA256: 8b605a6eabd466da265e9e1aa3576160c4dbee06643ece5a18cdb1e45f3f683a
  • Pointer size: 131 Bytes
  • Size of remote file: 114 kB
input/letter-3.jpg ADDED

Git LFS Details

  • SHA256: 7fe2d81bb4e8bef7cdbf87c58a8cc180c49c313e5099de167ae37bbbfb895e88
  • Pointer size: 131 Bytes
  • Size of remote file: 231 kB
input/magician-satire.jpg ADDED

Git LFS Details

  • SHA256: 3becaf6f5548a794436864885bb125f3fa09f1e6f7bdd76e8878f2d36ff26232
  • Pointer size: 132 Bytes
  • Size of remote file: 2.96 MB
input/menu.pdf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:42d96008f374f5be8046b569c868e33f4e5a0e5e166c245d324b44140c7e6c2e
3
+ size 2554815
input/milgram-flier.png CHANGED

Git LFS Details

  • SHA256: 0e1ca2821304427dcf7e2c9e0a03de880f44146bf8fa6abc9a437249fda85486
  • Pointer size: 130 Bytes
  • Size of remote file: 88.5 kB
input/okeefe-menu.pdf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:42d96008f374f5be8046b569c868e33f4e5a0e5e166c245d324b44140c7e6c2e
3
+ size 2554815
input/okeefe-recipe.jpg ADDED
input/recipe.jpg CHANGED

Git LFS Details

  • SHA256: 8bdb2a05dee10e4e181d8636714915f3055c664297e512f805fea180446624b2
  • Pointer size: 130 Bytes
  • Size of remote file: 70.8 kB