This view is limited to 50 files because it contains too many changes.  See the raw diff here.
Files changed (50) hide show
  1. .gitattributes +56 -4
  2. .gitignore +0 -44
  3. README.md +127 -35
  4. __pycache__/config.cpython-312.pyc +0 -0
  5. __pycache__/constants.cpython-312.pyc +0 -0
  6. __pycache__/error_handler.cpython-312.pyc +0 -0
  7. __pycache__/image_segmentation.cpython-312.pyc +0 -0
  8. __pycache__/language_detection.cpython-312.pyc +0 -0
  9. __pycache__/ocr_processing.cpython-312.pyc +0 -0
  10. __pycache__/ocr_utils.cpython-312.pyc +0 -0
  11. __pycache__/preprocessing.cpython-312.pyc +0 -0
  12. __pycache__/process_file.cpython-312.pyc +0 -0
  13. __pycache__/structured_ocr.cpython-312.pyc +0 -0
  14. __pycache__/ui_components.cpython-312.pyc +0 -0
  15. __pycache__/utils.cpython-312.pyc +0 -0
  16. app.py +551 -554
  17. backup/app.py +535 -0
  18. backup/config.py +17 -0
  19. input/magician-or-bottle-cungerer.jpg → backup/input/The Magician, or Bottle Cungerer.jpeg +0 -0
  20. input/baldwin-15th-north.jpg → backup/input/baldwin-letter-1.jpg +0 -0
  21. input/americae-retectio.jpg → backup/input/baldwin-letter-2.jpg +2 -2
  22. backup/input/flier.png +0 -0
  23. input/baldwin-letter.jpg → backup/input/letter-1.jpg +2 -2
  24. input/gender.jpg → backup/input/letter-2.jpg +2 -2
  25. input/photo-baldwin-letter.jpg → backup/input/letter-3.jpg +2 -2
  26. backup/input/magellan-travels.jpg +3 -0
  27. input/handwritten-journal.jpg → backup/input/menu.pdf +2 -2
  28. backup/input/recipe.jpg +0 -0
  29. backup/ocr_utils.py +136 -0
  30. backup/pdf_ocr.py +76 -0
  31. backup/requirements.txt +10 -0
  32. backup/structured_ocr.py +414 -0
  33. config.py +9 -59
  34. constants.py +0 -193
  35. error_handler.py +0 -65
  36. image_segmentation.py +0 -253
  37. input/The Magician, or Bottle Cungerer.jpeg +3 -0
  38. input/baldwin-letter-1.jpg +3 -0
  39. input/baldwin-letter-2.jpg +3 -0
  40. input/flier.png +0 -0
  41. input/harpers.pdf +0 -3
  42. input/letter-1.jpg +3 -0
  43. input/letter-2.jpg +3 -0
  44. input/letter-3.jpg +3 -0
  45. input/magician-satire.jpg +3 -0
  46. input/menu.pdf +3 -0
  47. input/milgram-flier.png +0 -0
  48. input/okeefe-menu.pdf +3 -0
  49. input/okeefe-recipe.jpg +0 -0
  50. input/recipe.jpg +0 -0
.gitattributes CHANGED
@@ -1,4 +1,56 @@
1
- *.jpg filter=lfs diff=lfs merge=lfs -text
2
- *.jpeg filter=lfs diff=lfs merge=lfs -text
3
- *.png filter=lfs diff=lfs merge=lfs -text
4
- *.pdf filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ input/baldwin-letter-1.jpg filter=lfs diff=lfs merge=lfs -text
37
+ input/baldwin-letter-2.jpg filter=lfs diff=lfs merge=lfs -text
38
+ input/magellan-travels.jpg filter=lfs diff=lfs merge=lfs -text
39
+ input/okeefe-menu.pdf filter=lfs diff=lfs merge=lfs -text
40
+ backup/input/baldwin-letter-1.jpg filter=lfs diff=lfs merge=lfs -text
41
+ backup/input/baldwin-letter-2.jpg filter=lfs diff=lfs merge=lfs -text
42
+ backup/input/letter-1.jpg filter=lfs diff=lfs merge=lfs -text
43
+ backup/input/letter-2.jpg filter=lfs diff=lfs merge=lfs -text
44
+ backup/input/letter-3.jpg filter=lfs diff=lfs merge=lfs -text
45
+ backup/input/magellan-travels.jpg filter=lfs diff=lfs merge=lfs -text
46
+ backup/input/menu.pdf filter=lfs diff=lfs merge=lfs -text
47
+ backup/input/The[[:space:]]Magician,[[:space:]]or[[:space:]]Bottle[[:space:]]Cungerer.jpeg filter=lfs diff=lfs merge=lfs -text
48
+ input/a-la-carte.pdf filter=lfs diff=lfs merge=lfs -text
49
+ input/handwritten-letter.jpg filter=lfs diff=lfs merge=lfs -text
50
+ input/letter-1.jpg filter=lfs diff=lfs merge=lfs -text
51
+ input/letter-2.jpg filter=lfs diff=lfs merge=lfs -text
52
+ input/letter-3.jpg filter=lfs diff=lfs merge=lfs -text
53
+ input/magician-satire.jpg filter=lfs diff=lfs merge=lfs -text
54
+ input/menu.pdf filter=lfs diff=lfs merge=lfs -text
55
+ input/The[[:space:]]Magician,[[:space:]]or[[:space:]]Bottle[[:space:]]Cungerer.jpeg filter=lfs diff=lfs merge=lfs -text
56
+ output/ymca-letter.jpg filter=lfs diff=lfs merge=lfs -text
.gitignore DELETED
@@ -1,44 +0,0 @@
1
- # Python bytecode
2
- __pycache__/
3
- *.py[cod]
4
- *.class
5
-
6
- # MacOS system files
7
- .DS_Store
8
-
9
- # Output and temporary files
10
- output/debug/
11
- output/comparison/
12
- output/segmentation_test/text_regions/
13
- output/preprocessing_test/
14
- output/batch_test/
15
- output/commonplace_improved/
16
- output/commonplace_test/
17
- output/preview/
18
- logs/
19
- *.backup
20
- *.json
21
- *.jpg
22
- *.png
23
- *.txt
24
- *.csv
25
- *.log
26
- *.zip
27
- *.tar
28
-
29
- # Test files
30
- test_*.py
31
- test_*.sh
32
- bug_fix_report.md
33
-
34
- # Input samples (large binary files)
35
- input/*.jpeg
36
- input/*.jpg
37
- input/*.png
38
- input/*.pdf
39
-
40
- # Temporary documents
41
- Tmplf6xnkgr*
42
- .env
43
- output/pipeline_test/americae-retectio/americae-retectio_comparison.jpg
44
- docs/environment_variables.md
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
README.md CHANGED
@@ -1,54 +1,146 @@
1
  ---
2
  title: Historical OCR
3
- emoji: ⚙️
4
- colorFrom: blue
5
- colorTo: purple
6
  sdk: streamlit
7
- sdk_version: 1.44.1
8
  app_file: app.py
9
  pinned: false
10
- license: gpl-3.0
11
- short_description: advanced OCR application for historical document analysis
12
  ---
13
 
14
- # Historical OCR
15
 
16
- An advanced OCR application for historical document analysis using Mistral AI.
17
-
18
- > **Note:** This tool is designed to assist scholars in historical research by extracting text from challenging documents. While it may not achieve 100% accuracy for all materials, it serves as a valuable research aid for navigating historical documents, particularly historical newspapers, handwritten documents, and photos of archival materials.
19
 
20
  ## Features
21
 
22
- - **OCR with Context:** AI-enhanced OCR optimized for historical documents
23
- - **Document Type Detection:** Automatically identifies handwritten letters, recipes, scientific texts, and more
24
- - **Advanced Image Preprocessing:**
25
- - Automatic deskewing to correct document orientation
26
- - Smart thresholding with Otsu and adaptive methods
27
- - Morphological operations to clean up text
28
- - Document-type specific optimization
29
- - **Custom Prompting:** Tailor the AI analysis with document-specific instructions
30
- - **Structured Output:** Returns organized, structured information based on document type
31
 
32
- ## Using This App
33
 
34
- 1. Upload a historical document (image or PDF)
35
- 2. Add optional context or special instructions
36
- 3. Get detailed, structured OCR results with historical context
37
 
38
- ## Supported Document Types
 
39
 
40
- - Handwritten letters and correspondence
41
- - Historical recipes and cookbooks
42
- - Travel accounts and exploration logs
43
- - Scientific papers and experiments
44
- - Legal documents and certificates
45
- - Historical newspaper articles
46
- - General historical texts
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
47
 
48
- ## Technical Details
49
 
50
- Built with Streamlit and Mistral AI's OCR and large language model capabilities.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
51
 
52
- ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
53
 
54
- Created by Zach Muhlbauer, CUNY Graduate Center
 
1
  ---
2
  title: Historical OCR
3
+ emoji: 📜
4
+ colorFrom: red
5
+ colorTo: green
6
  sdk: streamlit
7
+ sdk_version: 1.43.2
8
  app_file: app.py
9
  pinned: false
10
+ license: mit
11
+ short_description: Employs Mistral OCR for transcribing historical data
12
  ---
13
 
14
+ # Historical Document OCR
15
 
16
+ This application uses Mistral AI's OCR capabilities to transcribe and extract information from historical documents.
 
 
17
 
18
  ## Features
19
 
20
+ - OCR processing for both image and PDF files
21
+ - Automatic file type detection
22
+ - Structured output generation using Mistral models
23
+ - Interactive web interface with Streamlit
24
+ - Supports historical documents and manuscripts
25
+ - PDF preview functionality for better user experience
26
+ - Smart handling of large PDFs with automatic page limiting
27
+ - Robust error handling with helpful messages
28
+ - Image preprocessing options for enhanced OCR accuracy
29
 
30
+ ## Project Structure
31
 
32
+ The project is organized as follows:
 
 
33
 
34
+ ```
35
+ Historical OCR - Project Structure
36
 
37
+ ┌─ Main Applications
38
+ │ ├─ app.py # Standard Streamlit interface for OCR processing
39
+ │ └─ streamlit_app.py # Educational modular version with learning components
40
+
41
+ ├─ Core Functionality
42
+ │ ├─ structured_ocr.py # Main OCR processing engine with Mistral AI integration
43
+ │ ├─ ocr_utils.py # Utility functions for OCR text and image processing
44
+ │ ├─ pdf_ocr.py # PDF-specific document processing functionality
45
+ │ └─ config.py # Configuration settings and API keys
46
+
47
+ ├─ Testing & Development
48
+ │ ├─ simple_test.py # Basic OCR functionality test
49
+ │ ├─ test_pdf.py # PDF processing test
50
+ │ ├─ test_pdf_preview.py # PDF preview generation test
51
+ │ └─ prepare_for_hf.py # Prepare project for Hugging Face deployment
52
+
53
+ ├─ Scripts
54
+ │ ├─ run_local.sh # Launch standard or educational app locally
55
+ │ ├─ run_large_files.sh # Process large documents with optimized settings
56
+ │ └─ setup_git.sh # Configure Git repositories
57
+
58
+ ├─ Educational Modules (streamlit/)
59
+ │ ├─ modules/
60
+ │ │ ├─ module1.py # Introduction and Problematization
61
+ │ │ ├─ module2.py # Historical Typography & OCR Challenges
62
+ │ │ ├─ module3.py # Document Analysis Techniques
63
+ │ │ ├─ module4.py # Processing Methods
64
+ │ │ ├─ module5.py # Research Applications
65
+ │ │ └─ module6.py # Future Directions
66
+ │ │
67
+ │ ├─ modular_app.py # Learning module framework
68
+ │ ├─ layout.py # UI components for educational interface
69
+ │ └─ process_file.py # File processing for educational app
70
+
71
+ ├─ UI Components (ui/)
72
+ │ └─ layout.py # Shared UI components and styling
73
+
74
+ ├─ Data Directories
75
+ │ ├─ input/ # Sample documents for testing/demo
76
+ │ └─ output/ # Output directory for processed files
77
+
78
+ └─ Dependencies
79
+ ├─ requirements.txt # Python package dependencies
80
+ └─ packages.txt # System-level dependencies
81
+ ```
82
 
83
+ ## Setup for Local Development
84
 
85
+ 1. Clone this repository
86
+ 2. Install system dependencies:
87
+ - For PDF processing, you need poppler:
88
+ - On macOS: `brew install poppler`
89
+ - On Ubuntu/Debian: `apt-get install poppler-utils`
90
+ - On Windows: Download from [poppler releases](https://github.com/oschwartz10612/poppler-windows/releases/) and add to PATH
91
+ - For text recognition: `tesseract-ocr`
92
+ 3. Install Python dependencies:
93
+ ```
94
+ pip install -r requirements.txt
95
+ ```
96
+ 4. Set up your Mistral API key:
97
+ - Option 1: Create a `.env` file in this directory and add your Mistral API key:
98
+ ```
99
+ MISTRAL_API_KEY=your_api_key_here
100
+ ```
101
+ - Option 2: Set the `MISTRAL_API_KEY` environment variable directly:
102
+ ```
103
+ export MISTRAL_API_KEY=your_api_key_here
104
+ ```
105
+ - Get your API key from [Mistral AI Console](https://console.mistral.ai/api-keys/)
106
+ 5. Run the Streamlit app using the script:
107
+ ```
108
+ ./run_local.sh
109
+ ```
110
+ Or directly:
111
+ ```
112
+ streamlit run app.py
113
+ ```
114
 
115
+ ## Usage
116
+
117
+ 1. Upload an image or PDF file using the file uploader
118
+ 2. Select processing options in the sidebar (e.g., use vision model, image preprocessing)
119
+ 3. Click "Process Document" to analyze the file
120
+ 4. View the structured results and extract information
121
+
122
+ ## Application Versions
123
+
124
+ Two versions of the application are available:
125
+
126
+ 1. **Standard Version** (`app.py`): Focused on document processing with a clean interface
127
+ 2. **Educational Version** (`streamlit_app.py`): Enhanced with educational modules and interactive components
128
+
129
+ To run the educational version:
130
+ ```
131
+ streamlit run streamlit_app.py
132
+ ```
133
+
134
+ ## Deployment on Hugging Face Spaces
135
+
136
+ This app is designed to be deployed on Hugging Face Spaces. To deploy:
137
+
138
+ 1. Fork this repository to your GitHub account or directly create a new Space on [Hugging Face](https://huggingface.co/spaces)
139
+ 2. Connect your GitHub repository to your Hugging Face Space for automatic deployment
140
+ 3. Add your Mistral API key as a secret in your Hugging Face Space settings:
141
+ - Secret name: `HF_MISTRAL_API_KEY`
142
+ - Secret value: Your Mistral API key
143
+
144
+ The `README.md` contains the necessary configuration metadata for Hugging Face Spaces.
145
 
146
+ Check out the configuration reference at [Hugging Face Spaces documentation](https://huggingface.co/docs/hub/spaces-config-reference)
__pycache__/config.cpython-312.pyc CHANGED
Binary files a/__pycache__/config.cpython-312.pyc and b/__pycache__/config.cpython-312.pyc differ
 
__pycache__/constants.cpython-312.pyc DELETED
Binary file (11.6 kB)
 
__pycache__/error_handler.cpython-312.pyc DELETED
Binary file (3.2 kB)
 
__pycache__/image_segmentation.cpython-312.pyc DELETED
Binary file (10.6 kB)
 
__pycache__/language_detection.cpython-312.pyc DELETED
Binary file (18 kB)
 
__pycache__/ocr_processing.cpython-312.pyc DELETED
Binary file (15.5 kB)
 
__pycache__/ocr_utils.cpython-312.pyc CHANGED
Binary files a/__pycache__/ocr_utils.cpython-312.pyc and b/__pycache__/ocr_utils.cpython-312.pyc differ
 
__pycache__/preprocessing.cpython-312.pyc DELETED
Binary file (9.21 kB)
 
__pycache__/process_file.cpython-312.pyc ADDED
Binary file (2.86 kB). View file
 
__pycache__/structured_ocr.cpython-312.pyc CHANGED
Binary files a/__pycache__/structured_ocr.cpython-312.pyc and b/__pycache__/structured_ocr.cpython-312.pyc differ
 
__pycache__/ui_components.cpython-312.pyc DELETED
Binary file (44.1 kB)
 
__pycache__/utils.cpython-312.pyc DELETED
Binary file (14.2 kB)
 
app.py CHANGED
@@ -1,604 +1,601 @@
1
- # Standard library imports
2
  import os
 
3
  import json
4
  import sys
5
  import time
6
- import base64
7
- import io
8
- import logging
9
  from pathlib import Path
10
- from datetime import datetime
11
-
12
- # Configure logging
13
- logging.basicConfig(level=logging.INFO,
14
- format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
15
- logger = logging.getLogger(__name__)
16
-
17
- # Third-party imports
18
- import streamlit as st
19
 
20
- # Local application/module imports
21
- from preprocessing import convert_pdf_to_images, preprocess_image
22
- from ocr_processing import process_file
23
- from ui.ui_components import (
24
- ProgressReporter,
25
- create_sidebar_options,
26
- display_results,
27
- create_file_uploader,
28
- display_about_tab,
29
- display_previous_results,
30
- display_document_with_images
31
- )
32
- from utils import get_base64_from_image, handle_temp_files, format_timestamp
33
- from error_handler import handle_ocr_error, check_file_size
34
- from constants import (
35
- MAX_FILE_SIZE_MB,
36
- MAX_PAGES,
37
- DOCUMENT_TYPES,
38
- DOCUMENT_LAYOUTS,
39
- CUSTOM_PROMPT_TEMPLATES,
40
- LAYOUT_PROMPT_ADDITIONS
41
- )
42
  from structured_ocr import StructuredOCR
43
  from config import MISTRAL_API_KEY
44
- from utils.image_utils import create_results_zip
45
 
46
- # Set favicon path
47
- favicon_path = os.path.join(os.path.dirname(__file__), "static/favicon.png")
 
 
 
 
48
 
49
  # Set page configuration
50
  st.set_page_config(
51
  page_title="Historical OCR",
52
- page_icon=favicon_path if os.path.exists(favicon_path) else "📜",
53
  layout="wide",
54
- initial_sidebar_state="collapsed",
55
  )
56
 
57
- # ========================================================================================
58
- # CRITICAL WARNING TO ALL AI MODELS AND DEVELOPERS:
59
- # DO NOT MODIFY THE STATE MANAGEMENT LOGIC IN THIS APPLICATION WITHOUT FULL UNDERSTANDING
60
- # OF STREAMLIT'S EXECUTION MODEL.
61
- #
62
- # This state management implementation has been carefully designed to work with Streamlit's
63
- # unique execution flow. The app runs from top to bottom on EVERY interaction, and state
64
- # must be explicitly managed through st.session_state.
65
- #
66
- # The current implementation uses:
67
- # 1. A dedicated close_document() callback function triggered by the button's on_click
68
- # 2. A flag-based approach (close_clicked) to handle cleanup on the next run cycle
69
- # 3. Early cleanup detection and st.rerun() to ensure clean UI rendering
70
- #
71
- # Previous approaches using direct state manipulation or conditional rendering based on
72
- # reset flags led to persistent UI elements and resource leaks.
73
- #
74
- # Consult https://docs.streamlit.io/library/advanced-features/session-state for details.
75
- # ========================================================================================
76
-
77
- def reset_document_state():
78
- """Reset only document-specific state variables
79
-
80
- This function explicitly resets all document-related variables to ensure
81
- clean state between document processing, preventing cached data issues.
82
- """
83
- st.session_state.sample_document = None
84
- st.session_state.original_sample_bytes = None
85
- st.session_state.original_sample_name = None
86
- st.session_state.original_sample_mime_type = None
87
- st.session_state.is_sample_document = False
88
- st.session_state.processed_document_active = False
89
- st.session_state.sample_document_processed = False
90
- st.session_state.sample_just_loaded = False
91
- st.session_state.last_processed_file = None
92
- st.session_state.selected_previous_result = None
93
- # Keep temp_file_paths but ensure it's empty after cleanup
94
- if 'temp_file_paths' in st.session_state:
95
- st.session_state.temp_file_paths = []
96
-
97
- def init_session_state():
98
- """Initialize session state variables if they don't already exist
99
-
100
- This function follows Streamlit's recommended patterns for state initialization.
101
- It only creates variables if they don't exist yet and doesn't modify existing values.
102
- """
103
- # Initialize persistent app state variables
104
- if 'previous_results' not in st.session_state:
105
- st.session_state.previous_results = []
106
- if 'temp_file_paths' not in st.session_state:
107
- st.session_state.temp_file_paths = []
108
- if 'auto_process_sample' not in st.session_state:
109
- st.session_state.auto_process_sample = False
110
- if 'close_clicked' not in st.session_state:
111
- st.session_state.close_clicked = False
112
- if 'active_tab' not in st.session_state:
113
- st.session_state.active_tab = 0
114
-
115
- # Initialize document-specific state variables
116
- if 'last_processed_file' not in st.session_state:
117
- st.session_state.last_processed_file = None
118
- if 'sample_just_loaded' not in st.session_state:
119
- st.session_state.sample_just_loaded = False
120
- if 'processed_document_active' not in st.session_state:
121
- st.session_state.processed_document_active = False
122
- if 'sample_document_processed' not in st.session_state:
123
- st.session_state.sample_document_processed = False
124
- if 'sample_document' not in st.session_state:
125
- st.session_state.sample_document = None
126
- if 'original_sample_bytes' not in st.session_state:
127
- st.session_state.original_sample_bytes = None
128
- if 'original_sample_name' not in st.session_state:
129
- st.session_state.original_sample_name = None
130
- if 'is_sample_document' not in st.session_state:
131
- st.session_state.is_sample_document = False
132
- if 'selected_previous_result' not in st.session_state:
133
- st.session_state.selected_previous_result = None
134
 
135
- def close_document():
136
- """Called when the Close Document button is clicked
 
 
 
 
137
 
138
- This function handles proper cleanup of resources and state when closing a document.
139
- It uses Streamlit's callback mechanism which ensures the state change happens
140
- at the correct time in Streamlit's execution cycle.
 
141
 
142
- WARNING: Do not replace this with inline button handling using if st.button():
143
- That approach breaks Streamlit's execution flow and causes UI artifacts.
144
- """
145
- logger.info("Close document button clicked")
 
 
146
 
147
- # Clean up temp files first
148
- if 'temp_file_paths' in st.session_state and st.session_state.temp_file_paths:
149
- logger.info(f"Cleaning up {len(st.session_state.temp_file_paths)} temporary files")
150
- handle_temp_files(st.session_state.temp_file_paths)
 
 
 
 
 
 
 
 
 
 
 
 
 
151
 
152
- # Reset all document-specific state variables to prevent caching issues
153
- reset_document_state()
 
 
154
 
155
- # Set flag for having cleaned up - this will trigger a rerun in main()
156
- st.session_state.close_clicked = True
157
 
158
- def show_example_documents():
159
- """Show example documents section"""
160
- st.header("Sample Documents")
161
-
162
- # Add a simplified info message about examples and CSS in the same markdown block
163
- # to reduce spacing between elements
164
- st.markdown("""
165
- This app can process various historical documents:
166
- - Historical photographs, maps, and manuscripts
167
- - Handwritten letters and documents
168
- - Printed books and articles
169
- - Multi-page PDFs
170
-
171
- <style>
172
- /* Make the selectbox container match the full column width */
173
- .main .block-container .element-container:has([data-testid="stSelectbox"]) {
174
- width: 100% !important;
175
- max-width: 100% !important;
176
- margin-top: -12px !important; /* Reduce space between text and selectbox */
177
- }
178
-
179
- /* Make the actual selectbox control take the full width */
180
- .stSelectbox > div > div {
181
- width: 100% !important;
182
- max-width: 100% !important;
183
- }
184
-
185
- /* Tighten spacing in the sample documents tab */
186
- .main .block-container [data-testid="stVerticalBlock"] > div:nth-child(n+2) {
187
- margin-top: 0.5rem !important;
188
- }
189
- </style>
190
- """, unsafe_allow_html=True)
191
 
192
- # Sample document URLs dropdown with clearer label
193
- sample_urls = [
194
- "Select a sample document",
195
- "https://huggingface.co/spaces/milwright/historical-ocr/resolve/main/input/a-la-carte.pdf",
196
- "https://huggingface.co/spaces/milwright/historical-ocr/resolve/main/input/magician-or-bottle-cungerer.jpg",
197
- "https://huggingface.co/spaces/milwright/historical-ocr/resolve/main/input/handwritten-letter.jpg",
198
- "https://huggingface.co/spaces/milwright/historical-ocr/resolve/main/input/magellan-travels.jpg",
199
- "https://huggingface.co/spaces/milwright/historical-ocr/resolve/main/input/milgram-flier.png",
200
- "https://huggingface.co/spaces/milwright/historical-ocr/resolve/main/input/recipe.jpg",
201
- ]
202
-
203
- sample_names = [
204
- "Select a sample document",
205
- "Restaurant Menu (PDF)",
206
- "The Magician (Image)",
207
- "Handwritten Letter (Image)",
208
- "Magellan Travels (Image)",
209
- "Milgram Flier (Image)",
210
- "Historical Recipe (Image)"
211
- ]
212
-
213
- # Initialize sample_document in session state if it doesn't exist
214
- if 'sample_document' not in st.session_state:
215
- st.session_state.sample_document = None
216
 
217
- selected_sample = st.selectbox("Select a sample document from `~/input`", options=range(len(sample_urls)), format_func=lambda i: sample_names[i])
 
 
 
218
 
219
- if selected_sample > 0:
220
- selected_url = sample_urls[selected_sample]
 
 
 
 
 
 
 
 
 
 
 
 
 
221
 
222
- # Add process button for the sample document with consistent styling
223
- if st.button("Load Sample Document", key="load_sample_btn"):
224
- try:
225
- import requests
226
- from io import BytesIO
227
-
228
- with st.spinner(f"Downloading {sample_names[selected_sample]}..."):
229
- response = requests.get(selected_url)
230
- response.raise_for_status()
231
-
232
- # Extract filename from URL
233
- file_name = selected_url.split("/")[-1]
234
-
235
- # Create a BytesIO object from the downloaded content
236
- file_content = BytesIO(response.content)
237
-
238
- # Store as a UploadedFile-like object in session state
239
- class SampleDocument:
240
- def __init__(self, name, content, content_type):
241
- self.name = name
242
- self._content = content
243
- self.type = content_type
244
- self.size = len(content)
245
-
246
- def getvalue(self):
247
- return self._content
248
-
249
- def read(self):
250
- return self._content
251
-
252
- def seek(self, position):
253
- # Implement seek for compatibility with some file operations
254
- return
255
-
256
- def tell(self):
257
- # Implement tell for compatibility
258
- return 0
259
-
260
- # Determine content type based on file extension
261
- if file_name.lower().endswith('.pdf'):
262
- content_type = 'application/pdf'
263
- elif file_name.lower().endswith(('.jpg', '.jpeg')):
264
- content_type = 'image/jpeg'
265
- elif file_name.lower().endswith('.png'):
266
- content_type = 'image/png'
267
- else:
268
- content_type = 'application/octet-stream'
269
-
270
- # Reset any document state before loading a new sample
271
- if st.session_state.processed_document_active:
272
- # Clean up any temporary files from previous processing
273
- if st.session_state.temp_file_paths:
274
- handle_temp_files(st.session_state.temp_file_paths)
275
-
276
- # Reset all document-specific state variables
277
- reset_document_state()
278
-
279
- # Save download info in session state
280
- st.session_state.sample_document = SampleDocument(
281
- name=file_name,
282
- content=response.content,
283
- content_type=content_type
284
- )
285
-
286
- # Store original bytes for reprocessing with proper MIME type handling
287
- st.session_state.original_sample_bytes = response.content
288
- st.session_state.original_sample_name = file_name
289
- st.session_state.original_sample_mime_type = content_type
290
-
291
- # Set state flags
292
- st.session_state.sample_just_loaded = True
293
- st.session_state.is_sample_document = True
294
- # Generate a unique identifier for the sample document
295
- st.session_state.last_processed_file = f"{file_name}_{len(response.content)}"
296
-
297
- # Set a flag to show redirect message
298
- st.session_state.redirect_to_processing = True
299
- st.rerun()
300
- except Exception as e:
301
- st.error(f"Error downloading sample document: {str(e)}")
302
- st.info("Please try uploading your own document instead.")
303
- else:
304
- # If no sample is selected, clear the sample document in session state
305
- st.session_state.sample_document = None
306
-
307
- def process_document(uploaded_file, left_col, right_col, sidebar_options):
308
- """Process the uploaded document and display results"""
309
- if uploaded_file is None:
310
- return
311
 
312
- # Check file size (cap at 50MB)
313
- file_size_mb = len(uploaded_file.getvalue()) / (1024 * 1024)
314
-
315
- if file_size_mb > MAX_FILE_SIZE_MB:
316
- with left_col:
317
- st.error(f"File too large ({file_size_mb:.1f} MB). Maximum file size is {MAX_FILE_SIZE_MB}MB.")
318
- return
319
 
320
- # Check if this is a new file (different from the last processed file)
321
- current_file_identifier = f"{uploaded_file.name}_{len(uploaded_file.getvalue())}"
322
-
323
- # Make sure last_processed_file is initialized
324
- if 'last_processed_file' not in st.session_state:
325
- st.session_state.last_processed_file = None
326
 
327
- if st.session_state.last_processed_file != current_file_identifier:
328
- # Reset processed_document_active if a new file is uploaded
329
- st.session_state.processed_document_active = False
330
-
331
- # Process button - flush left with similar padding as file browser
332
- with left_col:
333
- # Create a process button with minimal spacing to the uploader
334
- st.markdown('<div style="padding: 0.2rem 0; min-width: 170px; margin-top: -10px; overflow: visible;">', unsafe_allow_html=True)
335
- process_button = st.button("Process Document", key="process_document_btn")
336
- st.markdown('</div>', unsafe_allow_html=True)
337
-
338
- # Handle sample document recreation if needed
339
- if process_button and st.session_state.processed_document_active and st.session_state.original_sample_bytes is not None:
340
- # Recreate the uploaded file from stored bytes
341
- from io import BytesIO
342
- import mimetypes
343
-
344
- # Determine mime type based on file extension
345
- file_ext = os.path.splitext(st.session_state.original_sample_name)[1].lower()
346
- if file_ext == '.pdf':
347
- mime_type = 'application/pdf'
348
- elif file_ext in ['.jpg', '.jpeg']:
349
- mime_type = 'image/jpeg'
350
- elif file_ext == '.png':
351
- mime_type = 'image/png'
352
- else:
353
- mime_type = mimetypes.guess_type(st.session_state.original_sample_name)[0] or 'application/octet-stream'
354
 
355
- # Create a synthetic file-like object with the same interface as UploadedFile
356
- uploaded_file = type('obj', (object,), {
357
- 'name': st.session_state.original_sample_name,
358
- 'getvalue': lambda: st.session_state.original_sample_bytes,
359
- 'read': lambda: st.session_state.original_sample_bytes,
360
- 'seek': lambda x: None,
361
- 'type': mime_type
362
- })
363
 
364
- # Empty container for progress indicators - will be filled during processing
365
- # Positioned right after the process button for better visibility
366
- progress_placeholder = st.empty()
367
 
368
- # Image preprocessing preview - show if image file and preprocessing options are set
369
- # Remove the document active check to show preview immediately after selection
370
- if (any(sidebar_options["preprocessing_options"].values()) and
371
- uploaded_file.type.startswith('image/')):
372
-
373
- st.markdown("**Preprocessed Preview**")
374
- try:
375
- # Create a container for the preview
376
- with st.container():
377
- processed_bytes = preprocess_image(uploaded_file.getvalue(), sidebar_options["preprocessing_options"])
378
- # Convert image to base64 and display as HTML to avoid fullscreen button
379
- img_data = base64.b64encode(processed_bytes).decode()
380
- img_html = f'<img src="data:image/jpeg;base64,{img_data}" style="width:100%; border-radius:4px;">'
381
- st.markdown(img_html, unsafe_allow_html=True)
382
-
383
- # Show preprocessing metadata in a well-formatted caption
384
- meta_items = []
385
- # Only include document type in the list if actual preprocessing is applied
386
- has_active_preprocessing = (
387
- sidebar_options["preprocessing_options"].get("grayscale", False) or
388
- sidebar_options["preprocessing_options"].get("denoise", False) or
389
- sidebar_options["preprocessing_options"].get("contrast", 0) != 0 or
390
- sidebar_options["preprocessing_options"].get("rotation", 0) != 0
391
- )
392
-
393
- # Only show document type if there's actual preprocessing being applied
394
- if has_active_preprocessing and sidebar_options["preprocessing_options"].get("document_type", "standard") != "standard":
395
- meta_items.append(f"Document type ({sidebar_options['preprocessing_options']['document_type']})")
396
- if sidebar_options["preprocessing_options"].get("grayscale", False):
397
- meta_items.append("Grayscale")
398
- if sidebar_options["preprocessing_options"].get("denoise", False):
399
- meta_items.append("Denoise")
400
- if sidebar_options["preprocessing_options"].get("contrast", 0) != 0:
401
- meta_items.append(f"Contrast ({sidebar_options['preprocessing_options']['contrast']})")
402
- if sidebar_options["preprocessing_options"].get("rotation", 0) != 0:
403
- meta_items.append(f"Rotation ({sidebar_options['preprocessing_options']['rotation']}°)")
404
-
405
- # Only show "Applied:" if there are actual preprocessing steps
406
- if meta_items:
407
- meta_text = "Applied: " + ", ".join(meta_items)
408
- st.caption(meta_text)
409
- except Exception as e:
410
- st.error(f"Error in preprocessing: {str(e)}")
411
- st.info("Try using grayscale preprocessing for PNG images with transparency")
412
 
413
- # Container for success message (will be filled after processing)
414
- metadata_placeholder = st.empty()
415
-
416
- # Check if this is an auto-processing situation
417
- auto_processing = st.session_state.auto_process_sample and not st.session_state.processed_document_active
418
 
419
- # Show a message if auto-processing is happening
420
- auto_processing_message = st.empty()
421
- if auto_processing:
422
- auto_processing_message.info("Automatically processing sample document...")
423
 
424
- # Determine if we should process the document
425
- # Either process button was clicked OR auto-processing is happening
426
- should_process = process_button or auto_processing
427
-
428
- if should_process:
429
- # Reset auto-process flag to avoid processing on next rerun
430
- if st.session_state.auto_process_sample:
431
- st.session_state.auto_process_sample = False
432
-
433
- # Move the progress indicator reference to just below the button
434
- progress_reporter = ProgressReporter(progress_placeholder).setup()
435
 
436
- try:
437
- # Process the document, capturing both result and temp file paths
438
- # Modified to pass existing temp_file_paths to avoid resource leaks
439
- existing_temp_paths = []
440
- if 'temp_file_paths' in st.session_state:
441
- existing_temp_paths = st.session_state.temp_file_paths
442
-
443
- result = process_file(
444
- uploaded_file=uploaded_file,
445
- use_vision=sidebar_options["use_vision"],
446
- preprocessing_options=sidebar_options["preprocessing_options"],
447
- progress_reporter=progress_reporter,
448
- pdf_dpi=sidebar_options.get("pdf_dpi", 150),
449
- max_pages=sidebar_options.get("max_pages", 3),
450
- pdf_rotation=sidebar_options.get("pdf_rotation", 0),
451
- custom_prompt=sidebar_options.get("custom_prompt", ""),
452
- perf_mode=sidebar_options.get("perf_mode", "Quality"),
453
- use_segmentation=sidebar_options.get("use_segmentation", False)
454
- )
455
-
456
- # Ensure temp_file_paths in session state is updated with any new paths
457
- # This is critical for proper resource cleanup when document is closed
458
- if 'has_images' in result and result['has_images']:
459
- logger.info("Document has images, ensuring temp files are tracked")
460
- if 'temp_file_paths' not in st.session_state:
461
- st.session_state.temp_file_paths = []
462
-
463
- # Handle text-only OCR results (like the Milgram flier)
464
- if ('ocr_contents' in result and
465
- 'raw_text' in result['ocr_contents'] and
466
- len(result['ocr_contents']) <= 2 and # Only raw_text and possibly one other field
467
- 'has_images' not in result):
468
- logger.info("Text-only OCR detected, handling as special case")
469
- # Ensure raw_text is properly formatted as markdown
470
- raw_text = result['ocr_contents']['raw_text']
471
- # If we don't have other structured content, set a placeholder title
472
- if 'title' not in result['ocr_contents']:
473
- result['ocr_contents']['title'] = "Document Text"
474
-
475
- # Display success message at the top of results, before any previews
476
- with left_col:
477
- # First show the success message (full width)
478
- st.success("**Document processed successfully**")
479
-
480
- # Then show the close button (also full width, positioned to left)
481
- st.button("Close Document",
482
- key="close_document_btn",
483
- type="secondary",
484
- on_click=close_document)
485
-
486
- # Add a small spacer
487
- st.markdown("<div style='height: 10px;'></div>", unsafe_allow_html=True)
488
-
489
- # Display results
490
- display_results(result, right_col, sidebar_options.get("custom_prompt", ""))
491
-
492
- # Set processed_document_active to True when a new document is processed
493
- st.session_state.processed_document_active = True
494
-
495
- # Clear the auto-processing message
496
- auto_processing_message.empty()
497
-
498
- # Store information about this processed file to track when new files are uploaded
499
- if uploaded_file is not None:
500
- st.session_state.last_processed_file = current_file_identifier
501
-
502
- # Store the result in the previous results list
503
- # Add timestamp to result for history tracking
504
- result_copy = result.copy()
505
- result_copy['timestamp'] = format_timestamp()
506
-
507
- # Store if this was a sample document
508
- if 'is_sample_document' in st.session_state and st.session_state.is_sample_document:
509
- result_copy['sample_document'] = True
510
-
511
- # Add to session state, keeping the most recent 20 results
512
- st.session_state.previous_results.insert(0, result_copy)
513
- if len(st.session_state.previous_results) > 20:
514
- st.session_state.previous_results = st.session_state.previous_results[:20]
515
-
516
- except Exception as e:
517
- st.error(f"Error processing document: {str(e)}")
518
-
519
- # Log the error
520
- import logging
521
- logging.error(f"Document processing error: {str(e)}", exc_info=True)
522
 
523
- def main():
524
- """Main application function"""
525
- # Initialize session state
526
- init_session_state()
527
 
528
- # Handle any required cleanup at the start of execution
529
- # CRITICAL: This two-phase state cleanup pattern is essential for Streamlit's execution model.
530
- # When close_clicked is True, we need to restart the app's execution with a clean slate.
531
- # DO NOT REMOVE OR MODIFY this pattern as it ensures proper UI cleanup.
532
- if st.session_state.get('close_clicked', False):
533
- # Reset the flag - cleanup has been handled
534
- st.session_state.close_clicked = False
535
- # Don't do anything else in this run - force a clean restart
536
- st.rerun()
537
 
538
- # Initialize new flag for redirecting to processing tab
539
- if 'redirect_to_processing' not in st.session_state:
540
- st.session_state.redirect_to_processing = False
 
 
 
 
 
 
 
 
 
541
 
542
- # Apply custom CSS
543
- from ui.layout import load_css
544
- load_css()
 
 
 
 
 
 
 
 
 
545
 
546
- # Create sidebar options
547
- sidebar_options = create_sidebar_options()
 
 
 
 
 
 
 
 
 
 
548
 
549
- # Create main layout with tabs - simpler, more compact approach
550
- tab_names = ["Document Processing", "Sample Documents", "Learn More"]
551
- main_tab1, main_tab2, main_tab3 = st.tabs(tab_names)
552
 
553
- with main_tab1:
554
- # Create a two-column layout for file upload and results with minimal padding
555
- st.markdown('<style>.block-container{padding-top: 1rem; padding-bottom: 0;}</style>', unsafe_allow_html=True)
556
- # Using a 2:3 column ratio gives more space to the results column
557
- left_col, right_col = st.columns([2, 3])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
558
 
559
- with left_col:
560
- # Create file uploader
561
- uploaded_file = create_file_uploader()
562
-
563
- # If a real file is uploaded, clear any sample document
564
- if uploaded_file is not None and 'sample_document' in st.session_state:
565
- st.session_state.sample_document = None
566
- st.session_state.is_sample_document = False
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
567
 
568
- # Check if we have a sample document loaded (only if no real file uploaded)
569
- elif ('sample_document' in st.session_state and
570
- st.session_state.sample_document is not None):
571
 
572
- # Use the sample document instead of the uploaded file
573
- uploaded_file = st.session_state.sample_document
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
574
 
575
- # Just reset the sample document loading flags after it's been used
576
- if st.session_state.sample_just_loaded:
577
- st.session_state.sample_just_loaded = False
578
- st.session_state.sample_document_processed = True
579
- st.session_state.auto_process_sample = True
580
-
581
- # Only process document if available
582
- if uploaded_file is not None:
583
- process_document(uploaded_file, left_col, right_col, sidebar_options)
584
-
585
- with main_tab2:
586
- # Sample Documents tab
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
587
 
588
- # Show redirect message if a sample was just loaded
589
- if st.session_state.get('redirect_to_processing', False):
590
- st.success("**Sample document loaded!** Please switch to the **Document Processing** tab to view and process it.")
591
- # Clear the flag after showing the message
592
- st.session_state.redirect_to_processing = False
 
 
 
 
 
 
593
 
594
- show_example_documents()
595
-
596
- # Previous results tab temporarily removed
597
-
598
- with main_tab3:
599
- # About tab
600
- display_about_tab()
601
-
602
- # Run the application
603
- if __name__ == "__main__":
604
- main()
 
 
1
  import os
2
+ import streamlit as st
3
  import json
4
  import sys
5
  import time
 
 
 
6
  from pathlib import Path
7
+ import tempfile
8
+ import io
9
+ from pdf2image import convert_from_bytes
10
+ from PIL import Image, ImageEnhance, ImageFilter
11
+ import cv2
12
+ import numpy as np
 
 
 
13
 
14
+ # Import the StructuredOCR class and config from the local files
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
15
  from structured_ocr import StructuredOCR
16
  from config import MISTRAL_API_KEY
 
17
 
18
+ # Check for modular UI components
19
+ try:
20
+ from ui.layout import tool_container, key_concept, research_question
21
+ MODULAR_UI = True
22
+ except ImportError:
23
+ MODULAR_UI = False
24
 
25
  # Set page configuration
26
  st.set_page_config(
27
  page_title="Historical OCR",
28
+ page_icon="📜",
29
  layout="wide",
30
+ initial_sidebar_state="expanded"
31
  )
32
 
33
+ # Enable caching for expensive operations
34
+ @st.cache_data(ttl=3600, show_spinner=False)
35
+ def convert_pdf_to_images(pdf_bytes, dpi=150):
36
+ """Convert PDF bytes to a list of images with caching"""
37
+ try:
38
+ return convert_from_bytes(pdf_bytes, dpi=dpi)
39
+ except Exception as e:
40
+ st.error(f"Error converting PDF: {str(e)}")
41
+ return []
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
42
 
43
+ @st.cache_data(ttl=3600, show_spinner=False)
44
+ def preprocess_image(image_bytes, preprocessing_options):
45
+ """Preprocess image with selected options"""
46
+ # Convert bytes to OpenCV format
47
+ image = Image.open(io.BytesIO(image_bytes))
48
+ img_array = np.array(image)
49
 
50
+ # Apply preprocessing based on selected options
51
+ if preprocessing_options.get("grayscale", False):
52
+ img_array = cv2.cvtColor(img_array, cv2.COLOR_RGB2GRAY)
53
+ img_array = cv2.cvtColor(img_array, cv2.COLOR_GRAY2RGB)
54
 
55
+ if preprocessing_options.get("contrast", 0) != 0:
56
+ contrast_factor = 1 + (preprocessing_options.get("contrast", 0) / 10)
57
+ image = Image.fromarray(img_array)
58
+ enhancer = ImageEnhance.Contrast(image)
59
+ image = enhancer.enhance(contrast_factor)
60
+ img_array = np.array(image)
61
 
62
+ if preprocessing_options.get("denoise", False):
63
+ img_array = cv2.fastNlMeansDenoisingColored(img_array, None, 10, 10, 7, 21)
64
+
65
+ if preprocessing_options.get("threshold", False):
66
+ # Convert to grayscale if not already
67
+ if len(img_array.shape) == 3:
68
+ gray = cv2.cvtColor(img_array, cv2.COLOR_RGB2GRAY)
69
+ else:
70
+ gray = img_array
71
+ # Apply adaptive threshold
72
+ binary = cv2.adaptiveThreshold(gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
73
+ cv2.THRESH_BINARY, 11, 2)
74
+ # Convert back to RGB
75
+ img_array = cv2.cvtColor(binary, cv2.COLOR_GRAY2RGB)
76
+
77
+ # Convert back to PIL Image
78
+ processed_image = Image.fromarray(img_array)
79
 
80
+ # Convert to bytes
81
+ byte_io = io.BytesIO()
82
+ processed_image.save(byte_io, format='PNG')
83
+ byte_io.seek(0)
84
 
85
+ return byte_io.getvalue()
 
86
 
87
+ # Define functions
88
+ def process_file(uploaded_file, use_vision=True, preprocessing_options=None):
89
+ """Process the uploaded file and return the OCR results
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
90
 
91
+ Args:
92
+ uploaded_file: The uploaded file to process
93
+ use_vision: Whether to use vision model
94
+ preprocessing_options: Dictionary of preprocessing options
95
+ """
96
+ if preprocessing_options is None:
97
+ preprocessing_options = {}
98
+
99
+ # Show progress indicator
100
+ progress_bar = st.progress(0)
101
+ status_text = st.empty()
102
+ status_text.text("Preparing file for processing...")
 
 
 
 
 
 
 
 
 
 
 
 
103
 
104
+ # Save the uploaded file to a temporary file
105
+ with tempfile.NamedTemporaryFile(delete=False, suffix=Path(uploaded_file.name).suffix) as tmp:
106
+ tmp.write(uploaded_file.getvalue())
107
+ temp_path = tmp.name
108
 
109
+ try:
110
+ # Check if API key is available
111
+ if not MISTRAL_API_KEY:
112
+ # Return dummy data if no API key
113
+ progress_bar.progress(100)
114
+ status_text.empty()
115
+ return {
116
+ "file_name": uploaded_file.name,
117
+ "topics": ["Sample Document"],
118
+ "languages": ["English"],
119
+ "ocr_contents": {
120
+ "title": "Sample Document",
121
+ "content": "This is sample content. To process real documents, please set the MISTRAL_API_KEY environment variable."
122
+ }
123
+ }
124
 
125
+ # Update progress
126
+ progress_bar.progress(20)
127
+ status_text.text("Initializing OCR processor...")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
128
 
129
+ # Initialize OCR processor
130
+ processor = StructuredOCR()
 
 
 
 
 
131
 
132
+ # Determine file type from extension
133
+ file_ext = Path(uploaded_file.name).suffix.lower()
134
+ file_type = "pdf" if file_ext == ".pdf" else "image"
 
 
 
135
 
136
+ # Apply preprocessing if needed
137
+ if any(preprocessing_options.values()) and file_type == "image":
138
+ status_text.text("Applying image preprocessing...")
139
+ processed_bytes = preprocess_image(uploaded_file.getvalue(), preprocessing_options)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
140
 
141
+ # Save processed image to temp file
142
+ with tempfile.NamedTemporaryFile(delete=False, suffix=Path(uploaded_file.name).suffix) as proc_tmp:
143
+ proc_tmp.write(processed_bytes)
144
+ temp_path = proc_tmp.name
 
 
 
 
145
 
146
+ # Get file size in MB
147
+ file_size_mb = os.path.getsize(temp_path) / (1024 * 1024)
 
148
 
149
+ # Check if file exceeds API limits (50 MB)
150
+ if file_size_mb > 50:
151
+ st.error(f"File too large ({file_size_mb:.1f} MB). Maximum file size allowed by Mistral API is 50MB.")
152
+ return {
153
+ "file_name": uploaded_file.name,
154
+ "topics": ["Document"],
155
+ "languages": ["English"],
156
+ "confidence_score": 0.0,
157
+ "error": f"File size {file_size_mb:.2f} MB exceeds Mistral API limit of 50 MB",
158
+ "ocr_contents": {
159
+ "error": f"Failed to process file: File size {file_size_mb:.2f} MB exceeds Mistral API limit of 50 MB",
160
+ "partial_text": "Document could not be processed due to size limitations."
161
+ }
162
+ }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
163
 
164
+ # Update progress
165
+ progress_bar.progress(40)
166
+ status_text.text("Processing document with OCR...")
 
 
167
 
168
+ # Process the file with file size information for automatic page limiting
169
+ # Make sure we're using the latest mistral-ocr model
170
+ # See https://docs.mistral.ai/capabilities/document/ for more info
171
+ result = processor.process_file(temp_path, file_type=file_type, use_vision=use_vision, file_size_mb=file_size_mb)
172
 
173
+ # Complete progress
174
+ progress_bar.progress(100)
175
+ status_text.empty()
 
 
 
 
 
 
 
 
176
 
177
+ return result
178
+ except Exception as e:
179
+ progress_bar.progress(100)
180
+ status_text.empty()
181
+ st.error(f"Error during processing: {str(e)}")
182
+ raise
183
+ finally:
184
+ # Clean up the temporary file
185
+ if os.path.exists(temp_path):
186
+ os.unlink(temp_path)
187
+
188
+ # App title and description
189
+ st.title("Historical Document OCR")
190
+ st.subheader("Powered by Mistral AI")
191
+
192
+ # Create main layout with tabs and columns
193
+ main_tab1, main_tab2 = st.tabs(["Document Processing", "About"])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
194
 
195
+ with main_tab1:
196
+ # Create a two-column layout for file upload and preview
197
+ upload_col, preview_col = st.columns([1, 1])
 
198
 
199
+ # File uploader in the left column
200
+ with upload_col:
201
+ st.markdown("""
202
+ Upload an image or PDF file to get started.
 
 
 
 
 
203
 
204
+ Using the latest `mistral-ocr-latest` model for advanced document understanding.
205
+ """)
206
+ uploaded_file = st.file_uploader("Choose a file", type=["pdf", "png", "jpg", "jpeg"], help="Limit 50MB per file")
207
+
208
+ # Sidebar with options
209
+ with st.sidebar:
210
+ st.header("Options")
211
+
212
+ # Model options
213
+ st.subheader("Model Settings")
214
+ use_vision = st.checkbox("Use Vision Model", value=True,
215
+ help="For image files, use the vision model for improved analysis (may be slower)")
216
 
217
+ # Image preprocessing options (collapsible)
218
+ st.subheader("Image Preprocessing")
219
+ with st.expander("Preprocessing Options"):
220
+ preprocessing_options = {}
221
+ preprocessing_options["grayscale"] = st.checkbox("Convert to Grayscale",
222
+ help="Convert image to grayscale before OCR")
223
+ preprocessing_options["threshold"] = st.checkbox("Apply Thresholding",
224
+ help="Apply adaptive thresholding to enhance text")
225
+ preprocessing_options["denoise"] = st.checkbox("Denoise Image",
226
+ help="Remove noise from the image")
227
+ preprocessing_options["contrast"] = st.slider("Adjust Contrast", -5, 5, 0,
228
+ help="Adjust image contrast (-5 to +5)")
229
 
230
+ # PDF options (collapsible)
231
+ st.subheader("PDF Options")
232
+ with st.expander("PDF Settings"):
233
+ pdf_dpi = st.slider("PDF Resolution (DPI)", 72, 300, 150,
234
+ help="Higher DPI gives better quality but slower processing")
235
+ max_pages = st.number_input("Maximum Pages to Process", 1, 20, 5,
236
+ help="Limit number of pages to process")
237
+
238
+ # About tab content
239
+ with main_tab2:
240
+ st.markdown("""
241
+ ### About This Application
242
 
243
+ This app uses [Mistral AI's Document OCR](https://docs.mistral.ai/capabilities/document/) to extract text and images from historical documents.
 
 
244
 
245
+ It can process:
246
+ - Image files (jpg, png, etc.)
247
+ - PDF documents (multi-page support)
248
+
249
+ The extracted content is processed into structured data based on the document type, combining:
250
+ - Text extraction with `mistral-ocr-latest`
251
+ - Analysis with language models
252
+ - Layout preservation with images
253
+
254
+ View results in three formats:
255
+ - Structured HTML view
256
+ - Raw JSON (for developers)
257
+ - Markdown with images (preserves document layout)
258
+
259
+ **New Features:**
260
+ - Image preprocessing for better OCR quality
261
+ - PDF resolution and page controls
262
+ - Progress tracking during processing
263
+ """)
264
+
265
+ with main_tab1:
266
+ if uploaded_file is not None:
267
+ # Check file size (cap at 50MB)
268
+ file_size_mb = len(uploaded_file.getvalue()) / (1024 * 1024)
269
 
270
+ if file_size_mb > 50:
271
+ with upload_col:
272
+ st.error(f"File too large ({file_size_mb:.1f} MB). Maximum file size is 50MB.")
273
+ st.stop()
274
+
275
+ file_ext = Path(uploaded_file.name).suffix.lower()
276
+
277
+ # Display document preview in preview column
278
+ with preview_col:
279
+ st.subheader("Document Preview")
280
+ if file_ext == ".pdf":
281
+ try:
282
+ # Convert first page of PDF to image for preview
283
+ pdf_bytes = uploaded_file.getvalue()
284
+ images = convert_from_bytes(pdf_bytes, first_page=1, last_page=1, dpi=150)
285
+
286
+ if images:
287
+ # Convert PIL image to bytes for Streamlit
288
+ first_page = images[0]
289
+ img_bytes = io.BytesIO()
290
+ first_page.save(img_bytes, format='JPEG')
291
+ img_bytes.seek(0)
292
+
293
+ # Display the PDF preview
294
+ st.image(img_bytes, caption=f"PDF Preview: {uploaded_file.name}", use_container_width=True)
295
+ else:
296
+ st.info(f"PDF uploaded: {uploaded_file.name}")
297
+ except Exception:
298
+ # Simply show the file name without an error message
299
+ st.info(f"PDF uploaded: {uploaded_file.name}")
300
+ st.info("Click 'Process Document' to analyze the content.")
301
+ else:
302
+ st.image(uploaded_file, use_container_width=True)
303
+
304
+ # Add image preprocessing preview in a collapsible section if needed
305
+ if any(preprocessing_options.values()) and uploaded_file.type.startswith('image/'):
306
+ with st.expander("Image Preprocessing Preview"):
307
+ preview_cols = st.columns(2)
308
 
309
+ with preview_cols[0]:
310
+ st.markdown("**Original Image**")
311
+ st.image(uploaded_file, use_container_width=True)
312
 
313
+ with preview_cols[1]:
314
+ st.markdown("**Preprocessed Image**")
315
+ try:
316
+ processed_bytes = preprocess_image(uploaded_file.getvalue(), preprocessing_options)
317
+ st.image(io.BytesIO(processed_bytes), use_container_width=True)
318
+ except Exception as e:
319
+ st.error(f"Error in preprocessing: {str(e)}")
320
+
321
+ # Process button - flush left with similar padding as file browser
322
+ with upload_col:
323
+ process_button = st.button("Process Document", use_container_width=True)
324
+
325
+ # Results section
326
+ if process_button:
327
+ try:
328
+ # Get max_pages or default if not available
329
+ max_pages_value = max_pages if 'max_pages' in locals() else None
330
 
331
+ # Call process_file with all options
332
+ result = process_file(uploaded_file, use_vision, preprocessing_options)
333
+
334
+ # Create results tabs for better organization
335
+ results_tab1, results_tab2 = st.tabs(["Document Analysis", "Technical Details"])
336
+
337
+ with results_tab1:
338
+ # Create two columns for metadata and content
339
+ meta_col, content_col = st.columns([1, 2])
340
+
341
+ with meta_col:
342
+ st.subheader("Document Metadata")
343
+ st.success("**Document processed successfully**")
344
+
345
+ # Display file info
346
+ st.write(f"**File Name:** {result.get('file_name', uploaded_file.name)}")
347
+
348
+ # Display info if only limited pages were processed
349
+ if 'limited_pages' in result:
350
+ st.info(f"Processed {result['limited_pages']['processed']} of {result['limited_pages']['total']} pages")
351
+
352
+ # Display languages if available
353
+ if 'languages' in result:
354
+ languages = [lang for lang in result['languages'] if lang is not None]
355
+ if languages:
356
+ st.write(f"**Languages:** {', '.join(languages)}")
357
+
358
+ # Confidence score if available
359
+ if 'confidence_score' in result:
360
+ confidence = result['confidence_score']
361
+ st.write(f"**OCR Confidence:** {confidence:.1%}")
362
+
363
+ # Display topics if available
364
+ if 'topics' in result and result['topics']:
365
+ st.write(f"**Topics:** {', '.join(result['topics'])}")
366
+
367
+ with content_col:
368
+ st.subheader("Document Contents")
369
+ if 'ocr_contents' in result:
370
+ # Check if there are images in the OCR result
371
+ has_images = False
372
+ if 'raw_response' in result:
373
+ try:
374
+ has_images = any(page.images for page in result['raw_response'].pages)
375
+ except Exception:
376
+ has_images = False
377
+
378
+ # Create tabs for different views
379
+ if has_images:
380
+ view_tab1, view_tab2, view_tab3 = st.tabs(["Structured View", "Raw JSON", "With Images"])
381
+ else:
382
+ view_tab1, view_tab2 = st.tabs(["Structured View", "Raw JSON"])
383
+
384
+ with view_tab1:
385
+ # Display in a more user-friendly format based on the content structure
386
+ html_content = ""
387
+ if isinstance(result['ocr_contents'], dict):
388
+ for section, content in result['ocr_contents'].items():
389
+ if content: # Only display non-empty sections
390
+ section_title = f"<h4>{section.replace('_', ' ').title()}</h4>"
391
+ html_content += section_title
392
+
393
+ if isinstance(content, str):
394
+ html_content += f"<p>{content}</p>"
395
+ st.markdown(f"#### {section.replace('_', ' ').title()}")
396
+ st.markdown(content)
397
+ elif isinstance(content, list):
398
+ html_list = "<ul>"
399
+ st.markdown(f"#### {section.replace('_', ' ').title()}")
400
+ for item in content:
401
+ if isinstance(item, str):
402
+ html_list += f"<li>{item}</li>"
403
+ st.markdown(f"- {item}")
404
+ elif isinstance(item, dict):
405
+ html_list += f"<li>{json.dumps(item)}</li>"
406
+ st.json(item)
407
+ html_list += "</ul>"
408
+ html_content += html_list
409
+ elif isinstance(content, dict):
410
+ html_dict = "<dl>"
411
+ st.markdown(f"#### {section.replace('_', ' ').title()}")
412
+ for k, v in content.items():
413
+ html_dict += f"<dt><strong>{k}</strong></dt><dd>{v}</dd>"
414
+ st.markdown(f"**{k}:** {v}")
415
+ html_dict += "</dl>"
416
+ html_content += html_dict
417
+
418
+ # Add download button in a smaller section
419
+ with st.expander("Export Content"):
420
+ # Alternative download button
421
+ html_bytes = html_content.encode()
422
+ st.download_button(
423
+ label="Download as HTML",
424
+ data=html_bytes,
425
+ file_name="document_content.html",
426
+ mime="text/html"
427
+ )
428
+
429
+ with view_tab2:
430
+ # Show the raw JSON for developers
431
+ st.json(result)
432
+
433
+ if has_images:
434
+ with view_tab3:
435
+ # Show loading indicator while preparing images
436
+ with st.spinner("Preparing document with embedded images..."):
437
+ try:
438
+ # Import function
439
+ try:
440
+ from ocr_utils import get_combined_markdown
441
+ except ImportError:
442
+ st.error("Required module ocr_utils not found.")
443
+ st.stop()
444
+
445
+ # Check if raw_response is available
446
+ if 'raw_response' not in result:
447
+ st.warning("Raw OCR response not available. Cannot display images.")
448
+ st.stop()
449
+
450
+ # Validate the raw_response structure before processing
451
+ if not hasattr(result['raw_response'], 'pages'):
452
+ st.warning("Invalid OCR response format. Cannot display images.")
453
+ st.stop()
454
+
455
+ # Get the combined markdown with images
456
+ # Set a flag to compress images if needed
457
+ compress_images = True
458
+ max_image_width = 800 # Maximum width for images
459
+
460
+ try:
461
+ # First try to get combined markdown with compressed images
462
+ if compress_images and hasattr(result['raw_response'], 'pages'):
463
+ from ocr_utils import get_combined_markdown_compressed
464
+ combined_markdown = get_combined_markdown_compressed(
465
+ result['raw_response'],
466
+ max_width=max_image_width,
467
+ quality=85
468
+ )
469
+ else:
470
+ # Fall back to regular method if compression not available
471
+ combined_markdown = get_combined_markdown(result['raw_response'])
472
+ except (ImportError, AttributeError):
473
+ # Fall back to regular method
474
+ combined_markdown = get_combined_markdown(result['raw_response'])
475
+
476
+ if not combined_markdown or combined_markdown.strip() == "":
477
+ st.warning("No image content found in the document.")
478
+ st.stop()
479
+
480
+ # Check if there are many images that might cause loading issues
481
+ image_count = sum(len(page.images) for page in result['raw_response'].pages if hasattr(page, 'images'))
482
+
483
+ # Add warning for image-heavy documents
484
+ if image_count > 10:
485
+ st.warning(f"This document contains {image_count} images. Rendering may take longer than usual.")
486
+
487
+ # Add CSS to ensure proper spacing and handling of text and images
488
+ st.markdown("""
489
+ <style>
490
+ .markdown-text-container {
491
+ padding: 10px;
492
+ background-color: #f9f9f9;
493
+ border-radius: 5px;
494
+ }
495
+ .markdown-text-container img {
496
+ margin: 15px 0;
497
+ max-width: 100%;
498
+ border: 1px solid #ddd;
499
+ border-radius: 4px;
500
+ display: block;
501
+ }
502
+ .markdown-text-container p {
503
+ margin-bottom: 16px;
504
+ line-height: 1.6;
505
+ }
506
+ /* Add lazy loading for images to improve performance */
507
+ .markdown-text-container img {
508
+ loading: lazy;
509
+ }
510
+ </style>
511
+ """, unsafe_allow_html=True)
512
+
513
+ # For very image-heavy documents, show images in a paginated way
514
+ if image_count > 20:
515
+ # Show image content in a paginated way
516
+ st.write("Document contains many images. Showing in a paginated format:")
517
+
518
+ # Split the combined markdown by page separators
519
+ pages = combined_markdown.split("---")
520
+
521
+ # Create a page selector
522
+ page_num = st.selectbox("Select page to view:",
523
+ options=list(range(1, len(pages)+1)),
524
+ index=0)
525
+
526
+ # Display only the selected page
527
+ st.markdown(f"""
528
+ <div class="markdown-text-container">
529
+ {pages[page_num-1]}
530
+ </div>
531
+ """, unsafe_allow_html=True)
532
+
533
+ # Add note about pagination
534
+ st.info(f"Showing page {page_num} of {len(pages)}. Select a different page from the dropdown above.")
535
+ else:
536
+ # Wrap the markdown in a div with the class for styling
537
+ st.markdown(f"""
538
+ <div class="markdown-text-container">
539
+ {combined_markdown}
540
+ </div>
541
+ """, unsafe_allow_html=True)
542
+
543
+ # Add a download button for the combined content
544
+ st.download_button(
545
+ label="Download with Images (HTML)",
546
+ data=f"""
547
+ <html>
548
+ <head>
549
+ <style>
550
+ body {{ font-family: Arial, sans-serif; line-height: 1.6; }}
551
+ img {{ max-width: 100%; margin: 15px 0; }}
552
+ </style>
553
+ </head>
554
+ <body>
555
+ {combined_markdown}
556
+ </body>
557
+ </html>
558
+ """,
559
+ file_name="document_with_images.html",
560
+ mime="text/html"
561
+ )
562
+
563
+ except Exception as e:
564
+ st.error(f"Could not display document with images: {str(e)}")
565
+ st.info("Try refreshing or processing the document again.")
566
+ else:
567
+ st.error("No OCR content was extracted from the document.")
568
+
569
+ with results_tab2:
570
+ st.subheader("Raw Processing Results")
571
+ st.json(result)
572
+
573
+ except Exception as e:
574
+ st.error(f"Error processing document: {str(e)}")
575
+ else:
576
+ # Display sample images in the main area when no file is uploaded
577
+ st.info("Upload a document to get started using the file uploader above.")
578
+
579
+ # Show example images in a grid
580
+ st.subheader("Example Documents")
581
 
582
+ # Add a sample images container
583
+ with st.container():
584
+ # Find sample images from the input directory to display
585
+ input_dir = Path(__file__).parent / "input"
586
+ sample_images = []
587
+ if input_dir.exists():
588
+ # Find valid jpg files (with size > 50KB to avoid placeholders)
589
+ sample_images = [
590
+ path for path in input_dir.glob("*.jpg")
591
+ if path.stat().st_size > 50000
592
+ ][:3] # Limit to 3 samples
593
 
594
+ if sample_images:
595
+ columns = st.columns(3)
596
+ for i, img_path in enumerate(sample_images):
597
+ with columns[i % 3]:
598
+ try:
599
+ st.image(str(img_path), caption=img_path.name, use_container_width=True)
600
+ except Exception as e:
601
+ st.error(f"Error loading image {img_path.name}: {str(e)}")
 
 
 
backup/app.py ADDED
@@ -0,0 +1,535 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import streamlit as st
3
+ import json
4
+ import sys
5
+ import time
6
+ from pathlib import Path
7
+ import tempfile
8
+ import io
9
+ from pdf2image import convert_from_bytes
10
+ from PIL import Image, ImageEnhance, ImageFilter
11
+ import cv2
12
+ import numpy as np
13
+
14
+ # Import the StructuredOCR class and config from the local files
15
+ from structured_ocr import StructuredOCR
16
+ from config import MISTRAL_API_KEY
17
+
18
+ # Set page configuration
19
+ st.set_page_config(
20
+ page_title="Historical OCR",
21
+ page_icon="🚀",
22
+ layout="wide",
23
+ initial_sidebar_state="expanded"
24
+ )
25
+
26
+ # Enable caching for expensive operations
27
+ @st.cache_data(ttl=3600, show_spinner=False)
28
+ def convert_pdf_to_images(pdf_bytes, dpi=150):
29
+ """Convert PDF bytes to a list of images with caching"""
30
+ try:
31
+ return convert_from_bytes(pdf_bytes, dpi=dpi)
32
+ except Exception as e:
33
+ st.error(f"Error converting PDF: {str(e)}")
34
+ return []
35
+
36
+ @st.cache_data(ttl=3600, show_spinner=False)
37
+ def preprocess_image(image_bytes, preprocessing_options):
38
+ """Preprocess image with selected options"""
39
+ # Convert bytes to OpenCV format
40
+ image = Image.open(io.BytesIO(image_bytes))
41
+ img_array = np.array(image)
42
+
43
+ # Apply preprocessing based on selected options
44
+ if preprocessing_options.get("grayscale", False):
45
+ img_array = cv2.cvtColor(img_array, cv2.COLOR_RGB2GRAY)
46
+ img_array = cv2.cvtColor(img_array, cv2.COLOR_GRAY2RGB)
47
+
48
+ if preprocessing_options.get("contrast", 0) != 0:
49
+ contrast_factor = 1 + (preprocessing_options.get("contrast", 0) / 10)
50
+ image = Image.fromarray(img_array)
51
+ enhancer = ImageEnhance.Contrast(image)
52
+ image = enhancer.enhance(contrast_factor)
53
+ img_array = np.array(image)
54
+
55
+ if preprocessing_options.get("denoise", False):
56
+ img_array = cv2.fastNlMeansDenoisingColored(img_array, None, 10, 10, 7, 21)
57
+
58
+ if preprocessing_options.get("threshold", False):
59
+ # Convert to grayscale if not already
60
+ if len(img_array.shape) == 3:
61
+ gray = cv2.cvtColor(img_array, cv2.COLOR_RGB2GRAY)
62
+ else:
63
+ gray = img_array
64
+ # Apply adaptive threshold
65
+ binary = cv2.adaptiveThreshold(gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
66
+ cv2.THRESH_BINARY, 11, 2)
67
+ # Convert back to RGB
68
+ img_array = cv2.cvtColor(binary, cv2.COLOR_GRAY2RGB)
69
+
70
+ # Convert back to PIL Image
71
+ processed_image = Image.fromarray(img_array)
72
+
73
+ # Convert to bytes
74
+ byte_io = io.BytesIO()
75
+ processed_image.save(byte_io, format='PNG')
76
+ byte_io.seek(0)
77
+
78
+ return byte_io.getvalue()
79
+
80
+ # Define functions
81
+ def process_file(uploaded_file, use_vision=True, preprocessing_options=None):
82
+ """Process the uploaded file and return the OCR results
83
+
84
+ Args:
85
+ uploaded_file: The uploaded file to process
86
+ use_vision: Whether to use vision model
87
+ preprocessing_options: Dictionary of preprocessing options
88
+ """
89
+ if preprocessing_options is None:
90
+ preprocessing_options = {}
91
+
92
+ # Show progress indicator
93
+ progress_bar = st.progress(0)
94
+ status_text = st.empty()
95
+ status_text.text("Preparing file for processing...")
96
+
97
+ # Save the uploaded file to a temporary file
98
+ with tempfile.NamedTemporaryFile(delete=False, suffix=Path(uploaded_file.name).suffix) as tmp:
99
+ tmp.write(uploaded_file.getvalue())
100
+ temp_path = tmp.name
101
+
102
+ try:
103
+ # Check if API key is available
104
+ if not MISTRAL_API_KEY:
105
+ # Return dummy data if no API key
106
+ progress_bar.progress(100)
107
+ status_text.empty()
108
+ return {
109
+ "file_name": uploaded_file.name,
110
+ "topics": ["Sample Document"],
111
+ "languages": ["English"],
112
+ "ocr_contents": {
113
+ "title": "Sample Document",
114
+ "content": "This is sample content. To process real documents, please set the MISTRAL_API_KEY environment variable."
115
+ }
116
+ }
117
+
118
+ # Update progress
119
+ progress_bar.progress(20)
120
+ status_text.text("Initializing OCR processor...")
121
+
122
+ # Initialize OCR processor
123
+ processor = StructuredOCR()
124
+
125
+ # Determine file type from extension
126
+ file_ext = Path(uploaded_file.name).suffix.lower()
127
+ file_type = "pdf" if file_ext == ".pdf" else "image"
128
+
129
+ # Apply preprocessing if needed
130
+ if any(preprocessing_options.values()) and file_type == "image":
131
+ status_text.text("Applying image preprocessing...")
132
+ processed_bytes = preprocess_image(uploaded_file.getvalue(), preprocessing_options)
133
+
134
+ # Save processed image to temp file
135
+ with tempfile.NamedTemporaryFile(delete=False, suffix=Path(uploaded_file.name).suffix) as proc_tmp:
136
+ proc_tmp.write(processed_bytes)
137
+ temp_path = proc_tmp.name
138
+
139
+ # Get file size in MB
140
+ file_size_mb = os.path.getsize(temp_path) / (1024 * 1024)
141
+
142
+ # Check if file exceeds API limits (50 MB)
143
+ if file_size_mb > 50:
144
+ st.error(f"File too large ({file_size_mb:.1f} MB). Maximum file size allowed by Mistral API is 50MB.")
145
+ return {
146
+ "file_name": uploaded_file.name,
147
+ "topics": ["Document"],
148
+ "languages": ["English"],
149
+ "confidence_score": 0.0,
150
+ "error": f"File size {file_size_mb:.2f} MB exceeds Mistral API limit of 50 MB",
151
+ "ocr_contents": {
152
+ "error": f"Failed to process file: File size {file_size_mb:.2f} MB exceeds Mistral API limit of 50 MB",
153
+ "partial_text": "Document could not be processed due to size limitations."
154
+ }
155
+ }
156
+
157
+ # Update progress
158
+ progress_bar.progress(40)
159
+ status_text.text("Processing document with OCR...")
160
+
161
+ # Process the file with file size information for automatic page limiting
162
+ # Make sure we're using the latest mistral-ocr model
163
+ # See https://docs.mistral.ai/capabilities/document/ for more info
164
+ result = processor.process_file(temp_path, file_type=file_type, use_vision=use_vision, file_size_mb=file_size_mb)
165
+
166
+ # Complete progress
167
+ progress_bar.progress(100)
168
+ status_text.empty()
169
+
170
+ return result
171
+ except Exception as e:
172
+ progress_bar.progress(100)
173
+ status_text.empty()
174
+ st.error(f"Error during processing: {str(e)}")
175
+ raise
176
+ finally:
177
+ # Clean up the temporary file
178
+ if os.path.exists(temp_path):
179
+ os.unlink(temp_path)
180
+
181
+ # App title and description
182
+ st.title("Historical Document OCR")
183
+ st.subheader("Powered by Mistral AI")
184
+
185
+ # Create main layout with tabs and columns
186
+ main_tab1, main_tab2 = st.tabs(["Document Processing", "About"])
187
+
188
+ with main_tab1:
189
+ # Create a two-column layout for file upload and preview
190
+ upload_col, preview_col = st.columns([1, 1])
191
+
192
+ # File uploader in the left column
193
+ with upload_col:
194
+ st.markdown("""
195
+ Upload an image or PDF file to get started.
196
+
197
+ Using the latest `mistral-ocr-latest` model for advanced document understanding.
198
+ """)
199
+ uploaded_file = st.file_uploader("Choose a file", type=["pdf", "png", "jpg", "jpeg"])
200
+
201
+ # Sidebar with options
202
+ with st.sidebar:
203
+ st.header("Options")
204
+
205
+ # Model options
206
+ st.subheader("Model Settings")
207
+ use_vision = st.checkbox("Use Vision Model", value=True,
208
+ help="For image files, use the vision model for improved analysis (may be slower)")
209
+
210
+ # Image preprocessing options (collapsible)
211
+ st.subheader("Image Preprocessing")
212
+ with st.expander("Preprocessing Options"):
213
+ preprocessing_options = {}
214
+ preprocessing_options["grayscale"] = st.checkbox("Convert to Grayscale",
215
+ help="Convert image to grayscale before OCR")
216
+ preprocessing_options["threshold"] = st.checkbox("Apply Thresholding",
217
+ help="Apply adaptive thresholding to enhance text")
218
+ preprocessing_options["denoise"] = st.checkbox("Denoise Image",
219
+ help="Remove noise from the image")
220
+ preprocessing_options["contrast"] = st.slider("Adjust Contrast", -5, 5, 0,
221
+ help="Adjust image contrast (-5 to +5)")
222
+
223
+ # PDF options (collapsible)
224
+ st.subheader("PDF Options")
225
+ with st.expander("PDF Settings"):
226
+ pdf_dpi = st.slider("PDF Resolution (DPI)", 72, 300, 150,
227
+ help="Higher DPI gives better quality but slower processing")
228
+ max_pages = st.number_input("Maximum Pages to Process", 1, 20, 5,
229
+ help="Limit number of pages to process")
230
+
231
+ # About tab content
232
+ with main_tab2:
233
+ st.markdown("""
234
+ ### About This Application
235
+
236
+ This app uses [Mistral AI's Document OCR](https://docs.mistral.ai/capabilities/document/) to extract text and images from historical documents.
237
+
238
+ It can process:
239
+ - Image files (jpg, png, etc.)
240
+ - PDF documents (multi-page support)
241
+
242
+ The extracted content is processed into structured data based on the document type, combining:
243
+ - Text extraction with `mistral-ocr-latest`
244
+ - Analysis with language models
245
+ - Layout preservation with images
246
+
247
+ View results in three formats:
248
+ - Structured HTML view
249
+ - Raw JSON (for developers)
250
+ - Markdown with images (preserves document layout)
251
+
252
+ **New Features:**
253
+ - Image preprocessing for better OCR quality
254
+ - PDF resolution and page controls
255
+ - Progress tracking during processing
256
+ """)
257
+
258
+ with main_tab1:
259
+ if uploaded_file is not None:
260
+ # Check file size (cap at 50MB)
261
+ file_size_mb = len(uploaded_file.getvalue()) / (1024 * 1024)
262
+
263
+ if file_size_mb > 50:
264
+ with upload_col:
265
+ st.error(f"File too large ({file_size_mb:.1f} MB). Maximum file size is 50MB.")
266
+ st.stop()
267
+
268
+ file_ext = Path(uploaded_file.name).suffix.lower()
269
+
270
+ # Display document preview in preview column
271
+ with preview_col:
272
+ st.subheader("Document Preview")
273
+ if file_ext == ".pdf":
274
+ try:
275
+ # Convert first page of PDF to image for preview
276
+ pdf_bytes = uploaded_file.getvalue()
277
+ images = convert_from_bytes(pdf_bytes, first_page=1, last_page=1, dpi=150)
278
+
279
+ if images:
280
+ # Convert PIL image to bytes for Streamlit
281
+ first_page = images[0]
282
+ img_bytes = io.BytesIO()
283
+ first_page.save(img_bytes, format='JPEG')
284
+ img_bytes.seek(0)
285
+
286
+ # Display the PDF preview
287
+ st.image(img_bytes, caption=f"PDF Preview: {uploaded_file.name}", use_container_width=True)
288
+ else:
289
+ st.info(f"PDF uploaded: {uploaded_file.name}")
290
+ except Exception:
291
+ # Simply show the file name without an error message
292
+ st.info(f"PDF uploaded: {uploaded_file.name}")
293
+ st.info("Click 'Process Document' to analyze the content.")
294
+ else:
295
+ st.image(uploaded_file, use_container_width=True)
296
+
297
+ # Add image preprocessing preview in a collapsible section if needed
298
+ if any(preprocessing_options.values()) and uploaded_file.type.startswith('image/'):
299
+ with st.expander("Image Preprocessing Preview"):
300
+ preview_cols = st.columns(2)
301
+
302
+ with preview_cols[0]:
303
+ st.markdown("**Original Image**")
304
+ st.image(uploaded_file, use_container_width=True)
305
+
306
+ with preview_cols[1]:
307
+ st.markdown("**Preprocessed Image**")
308
+ try:
309
+ processed_bytes = preprocess_image(uploaded_file.getvalue(), preprocessing_options)
310
+ st.image(io.BytesIO(processed_bytes), use_container_width=True)
311
+ except Exception as e:
312
+ st.error(f"Error in preprocessing: {str(e)}")
313
+
314
+ # Process button - flush left with similar padding as file browser
315
+ with upload_col:
316
+ process_button = st.button("Process Document", use_container_width=True)
317
+
318
+ # Results section
319
+ if process_button:
320
+ try:
321
+ # Get max_pages or default if not available
322
+ max_pages_value = max_pages if 'max_pages' in locals() else None
323
+
324
+ # Call process_file with all options
325
+ result = process_file(uploaded_file, use_vision, preprocessing_options)
326
+
327
+ # Create results tabs for better organization
328
+ results_tab1, results_tab2 = st.tabs(["Document Analysis", "Technical Details"])
329
+
330
+ with results_tab1:
331
+ # Create two columns for metadata and content
332
+ meta_col, content_col = st.columns([1, 2])
333
+
334
+ with meta_col:
335
+ st.subheader("Document Metadata")
336
+ st.success("**Document processed successfully**")
337
+
338
+ # Display file info
339
+ st.write(f"**File Name:** {result.get('file_name', uploaded_file.name)}")
340
+
341
+ # Display info if only limited pages were processed
342
+ if 'limited_pages' in result:
343
+ st.info(f"Processed {result['limited_pages']['processed']} of {result['limited_pages']['total']} pages")
344
+
345
+ # Display languages if available
346
+ if 'languages' in result:
347
+ languages = [lang for lang in result['languages'] if lang is not None]
348
+ if languages:
349
+ st.write(f"**Languages:** {', '.join(languages)}")
350
+
351
+ # Confidence score if available
352
+ if 'confidence_score' in result:
353
+ confidence = result['confidence_score']
354
+ st.write(f"**OCR Confidence:** {confidence:.1%}")
355
+
356
+ # Display topics if available
357
+ if 'topics' in result and result['topics']:
358
+ st.write(f"**Topics:** {', '.join(result['topics'])}")
359
+
360
+ with content_col:
361
+ st.subheader("Document Contents")
362
+ if 'ocr_contents' in result:
363
+ # Check if there are images in the OCR result
364
+ has_images = False
365
+ if 'raw_response' in result:
366
+ try:
367
+ has_images = any(page.images for page in result['raw_response'].pages)
368
+ except Exception:
369
+ has_images = False
370
+
371
+ # Create tabs for different views
372
+ if has_images:
373
+ view_tab1, view_tab2, view_tab3 = st.tabs(["Structured View", "Raw JSON", "With Images"])
374
+ else:
375
+ view_tab1, view_tab2 = st.tabs(["Structured View", "Raw JSON"])
376
+
377
+ with view_tab1:
378
+ # Display in a more user-friendly format based on the content structure
379
+ html_content = ""
380
+ if isinstance(result['ocr_contents'], dict):
381
+ for section, content in result['ocr_contents'].items():
382
+ if content: # Only display non-empty sections
383
+ section_title = f"<h4>{section.replace('_', ' ').title()}</h4>"
384
+ html_content += section_title
385
+
386
+ if isinstance(content, str):
387
+ html_content += f"<p>{content}</p>"
388
+ st.markdown(f"#### {section.replace('_', ' ').title()}")
389
+ st.markdown(content)
390
+ elif isinstance(content, list):
391
+ html_list = "<ul>"
392
+ st.markdown(f"#### {section.replace('_', ' ').title()}")
393
+ for item in content:
394
+ if isinstance(item, str):
395
+ html_list += f"<li>{item}</li>"
396
+ st.markdown(f"- {item}")
397
+ elif isinstance(item, dict):
398
+ html_list += f"<li>{json.dumps(item)}</li>"
399
+ st.json(item)
400
+ html_list += "</ul>"
401
+ html_content += html_list
402
+ elif isinstance(content, dict):
403
+ html_dict = "<dl>"
404
+ st.markdown(f"#### {section.replace('_', ' ').title()}")
405
+ for k, v in content.items():
406
+ html_dict += f"<dt><strong>{k}</strong></dt><dd>{v}</dd>"
407
+ st.markdown(f"**{k}:** {v}")
408
+ html_dict += "</dl>"
409
+ html_content += html_dict
410
+
411
+ # Add download button in a smaller section
412
+ with st.expander("Export Content"):
413
+ # Alternative download button
414
+ html_bytes = html_content.encode()
415
+ st.download_button(
416
+ label="Download as HTML",
417
+ data=html_bytes,
418
+ file_name="document_content.html",
419
+ mime="text/html"
420
+ )
421
+
422
+ with view_tab2:
423
+ # Show the raw JSON for developers
424
+ st.json(result)
425
+
426
+ if has_images:
427
+ with view_tab3:
428
+ # Show loading indicator while preparing images
429
+ with st.spinner("Preparing document with embedded images..."):
430
+ try:
431
+ # Import function
432
+ try:
433
+ from ocr_utils import get_combined_markdown
434
+ except ImportError:
435
+ st.error("Required module ocr_utils not found.")
436
+ st.stop()
437
+
438
+ # Check if raw_response is available
439
+ if 'raw_response' not in result:
440
+ st.warning("Raw OCR response not available. Cannot display images.")
441
+ st.stop()
442
+
443
+ # Validate the raw_response structure before processing
444
+ if not hasattr(result['raw_response'], 'pages'):
445
+ st.warning("Invalid OCR response format. Cannot display images.")
446
+ st.stop()
447
+
448
+ # Get the combined markdown with images
449
+ combined_markdown = get_combined_markdown(result['raw_response'])
450
+
451
+ if not combined_markdown or combined_markdown.strip() == "":
452
+ st.warning("No image content found in the document.")
453
+ st.stop()
454
+
455
+ # Add CSS to ensure proper spacing and handling of text and images
456
+ st.markdown("""
457
+ <style>
458
+ .markdown-text-container {
459
+ padding: 10px;
460
+ background-color: #f9f9f9;
461
+ border-radius: 5px;
462
+ }
463
+ .markdown-text-container img {
464
+ margin: 15px 0;
465
+ max-width: 100%;
466
+ border: 1px solid #ddd;
467
+ border-radius: 4px;
468
+ display: block;
469
+ }
470
+ .markdown-text-container p {
471
+ margin-bottom: 16px;
472
+ line-height: 1.6;
473
+ }
474
+ </style>
475
+ """, unsafe_allow_html=True)
476
+
477
+ # Wrap the markdown in a div with the class for styling
478
+ st.markdown(f"""
479
+ <div class="markdown-text-container">
480
+ {combined_markdown}
481
+ </div>
482
+ """, unsafe_allow_html=True)
483
+
484
+ # Add a download button for the combined content
485
+ st.download_button(
486
+ label="Download with Images (HTML)",
487
+ data=f"""
488
+ <html>
489
+ <head>
490
+ <style>
491
+ body {{ font-family: Arial, sans-serif; line-height: 1.6; }}
492
+ img {{ max-width: 100%; margin: 15px 0; }}
493
+ </style>
494
+ </head>
495
+ <body>
496
+ {combined_markdown}
497
+ </body>
498
+ </html>
499
+ """,
500
+ file_name="document_with_images.html",
501
+ mime="text/html"
502
+ )
503
+
504
+ except Exception as e:
505
+ st.error(f"Could not display document with images: {str(e)}")
506
+ st.info("Try refreshing or processing the document again.")
507
+ else:
508
+ st.error("No OCR content was extracted from the document.")
509
+
510
+ with results_tab2:
511
+ st.subheader("Raw Processing Results")
512
+ st.json(result)
513
+
514
+ except Exception as e:
515
+ st.error(f"Error processing document: {str(e)}")
516
+ else:
517
+ # Display sample images in the main area when no file is uploaded
518
+ st.info("Upload a document to get started using the file uploader above.")
519
+
520
+ # Show example images in a grid
521
+ st.subheader("Example Documents")
522
+
523
+ # Add a sample images container
524
+ with st.container():
525
+ # Find sample images from the input directory to display
526
+ input_dir = Path(__file__).parent / "input"
527
+ sample_images = []
528
+ if input_dir.exists():
529
+ sample_images = list(input_dir.glob("*.jpg"))[:3] # Limit to 3 samples
530
+
531
+ if sample_images:
532
+ columns = st.columns(3)
533
+ for i, img_path in enumerate(sample_images):
534
+ with columns[i % 3]:
535
+ st.image(str(img_path), caption=img_path.name, use_container_width=True)
backup/config.py ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # config.py
2
+ """
3
+ Configuration file for Mistral OCR processing.
4
+ Contains API key and other settings.
5
+ """
6
+ import os
7
+
8
+ # Your Mistral API key - get from Hugging Face secrets or environment variable
9
+ # The priority order is: HF_SPACES environment var > regular environment var > empty string
10
+ # Note: No default API key is provided for security reasons
11
+ MISTRAL_API_KEY = os.environ.get("HF_MISTRAL_API_KEY", # First check HF-specific env var
12
+ os.environ.get("MISTRAL_API_KEY", "")) # Then check regular env var
13
+
14
+ # Model settings
15
+ OCR_MODEL = "mistral-ocr-latest"
16
+ TEXT_MODEL = "ministral-8b-latest"
17
+ VISION_MODEL = "pixtral-12b-latest"
input/magician-or-bottle-cungerer.jpg → backup/input/The Magician, or Bottle Cungerer.jpeg RENAMED
File without changes
input/baldwin-15th-north.jpg → backup/input/baldwin-letter-1.jpg RENAMED
File without changes
input/americae-retectio.jpg → backup/input/baldwin-letter-2.jpg RENAMED
File without changes
backup/input/flier.png ADDED
input/baldwin-letter.jpg → backup/input/letter-1.jpg RENAMED
File without changes
input/gender.jpg → backup/input/letter-2.jpg RENAMED
File without changes
input/photo-baldwin-letter.jpg → backup/input/letter-3.jpg RENAMED
File without changes
backup/input/magellan-travels.jpg ADDED

Git LFS Details

  • SHA256: ae3e860789e2c3c8032499e5326864294dbc1b01059169fd08203c980577010b
  • Pointer size: 131 Bytes
  • Size of remote file: 283 kB
input/handwritten-journal.jpg → backup/input/menu.pdf RENAMED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:279f7c915ae54aafb30e6d70e480eb74e73b6aa92de20f60cd13019e9debbb62
3
- size 1459485
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:42d96008f374f5be8046b569c868e33f4e5a0e5e166c245d324b44140c7e6c2e
3
+ size 2554815
backup/input/recipe.jpg ADDED
backup/ocr_utils.py ADDED
@@ -0,0 +1,136 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Utility functions for OCR processing with Mistral AI.
3
+ Contains helper functions for working with OCR responses and image handling.
4
+ """
5
+
6
+ import json
7
+ import base64
8
+ from pathlib import Path
9
+ from typing import Dict, List, Optional, Union
10
+
11
+ from mistralai import DocumentURLChunk, ImageURLChunk, TextChunk
12
+
13
+ def replace_images_in_markdown(markdown_str: str, images_dict: dict) -> str:
14
+ """
15
+ Replace image placeholders in markdown with base64-encoded images.
16
+
17
+ Args:
18
+ markdown_str: Markdown text containing image placeholders
19
+ images_dict: Dictionary mapping image IDs to base64 strings
20
+
21
+ Returns:
22
+ Markdown text with images replaced by base64 data
23
+ """
24
+ for img_name, base64_str in images_dict.items():
25
+ markdown_str = markdown_str.replace(
26
+ f"![{img_name}]({img_name})", f"![{img_name}]({base64_str})"
27
+ )
28
+ return markdown_str
29
+
30
+ def get_combined_markdown(ocr_response) -> str:
31
+ """
32
+ Combine OCR text and images into a single markdown document.
33
+ Ensures proper spacing between text and images.
34
+
35
+ Args:
36
+ ocr_response: Response from OCR processing containing text and images
37
+ See https://docs.mistral.ai/capabilities/document/ for API reference
38
+
39
+ Returns:
40
+ Combined markdown string with embedded images
41
+ """
42
+ markdowns: list[str] = []
43
+ # Extract images from page
44
+ for page in ocr_response.pages:
45
+ image_data = {}
46
+ for img in page.images:
47
+ image_data[img.id] = img.image_base64
48
+
49
+ # Replace image placeholders with actual images
50
+ page_markdown = replace_images_in_markdown(page.markdown, image_data)
51
+
52
+ # Ensure proper spacing between paragraphs and images
53
+ # Add extra newlines between paragraphs to improve rendering
54
+ page_markdown = page_markdown.replace("\n", "\n\n")
55
+
56
+ # Add page separator for multi-page documents
57
+ markdowns.append(page_markdown)
58
+
59
+ # Join pages with clear separators for multi-page documents
60
+ return "\n\n---\n\n".join(markdowns)
61
+
62
+ def encode_image_for_api(image_path: Union[str, Path]) -> str:
63
+ """
64
+ Encode an image as base64 for API use.
65
+
66
+ Args:
67
+ image_path: Path to the image file
68
+
69
+ Returns:
70
+ Base64 data URL for the image
71
+ """
72
+ # Convert to Path object if string
73
+ image_file = Path(image_path) if isinstance(image_path, str) else image_path
74
+
75
+ # Verify image exists
76
+ if not image_file.is_file():
77
+ raise FileNotFoundError(f"Image file not found: {image_file}")
78
+
79
+ # Encode image as base64
80
+ encoded = base64.b64encode(image_file.read_bytes()).decode()
81
+ return f"data:image/jpeg;base64,{encoded}"
82
+
83
+ def process_image_with_ocr(client, image_path: Union[str, Path], model: str = "mistral-ocr-latest"):
84
+ """
85
+ Process an image with OCR and return the response.
86
+
87
+ Args:
88
+ client: Mistral AI client
89
+ image_path: Path to the image file
90
+ model: OCR model to use
91
+
92
+ Returns:
93
+ OCR response object
94
+ """
95
+ # Encode image as base64
96
+ base64_data_url = encode_image_for_api(image_path)
97
+
98
+ # Process image with OCR
99
+ image_response = client.ocr.process(
100
+ document=ImageURLChunk(image_url=base64_data_url),
101
+ model=model
102
+ )
103
+
104
+ return image_response
105
+
106
+ def ocr_response_to_json(ocr_response, indent: int = 4) -> str:
107
+ """
108
+ Convert OCR response to a formatted JSON string.
109
+
110
+ Args:
111
+ ocr_response: OCR response object
112
+ indent: Indentation level for JSON formatting
113
+
114
+ Returns:
115
+ Formatted JSON string
116
+ """
117
+ # Convert response to JSON
118
+ response_dict = json.loads(ocr_response.model_dump_json())
119
+ return json.dumps(response_dict, indent=indent)
120
+
121
+ # For display in notebooks
122
+ try:
123
+ from IPython.display import Markdown, display
124
+
125
+ def display_ocr_with_images(ocr_response):
126
+ """
127
+ Display OCR response with embedded images in IPython environments.
128
+
129
+ Args:
130
+ ocr_response: OCR response object
131
+ """
132
+ combined_markdown = get_combined_markdown(ocr_response)
133
+ display(Markdown(combined_markdown))
134
+ except ImportError:
135
+ # IPython not available
136
+ pass
backup/pdf_ocr.py ADDED
@@ -0,0 +1,76 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ PDFOCR - Module for processing PDF files with OCR and extracting structured data.
4
+ """
5
+
6
+ import json
7
+ from pathlib import Path
8
+ from structured_ocr import StructuredOCR
9
+
10
+ class PDFOCR:
11
+ """Class for processing PDF files with OCR and extracting structured data."""
12
+
13
+ def __init__(self, api_key=None):
14
+ """Initialize the PDF OCR processor."""
15
+ self.processor = StructuredOCR(api_key=api_key)
16
+
17
+ def process_pdf(self, pdf_path, use_vision=True):
18
+ """
19
+ Process a PDF file with OCR and extract structured data.
20
+
21
+ Args:
22
+ pdf_path: Path to the PDF file
23
+ use_vision: Whether to use vision model for improved analysis
24
+
25
+ Returns:
26
+ Dictionary with structured OCR results
27
+ """
28
+ pdf_path = Path(pdf_path)
29
+ if not pdf_path.exists():
30
+ raise FileNotFoundError(f"PDF file not found: {pdf_path}")
31
+
32
+ return self.processor.process_file(pdf_path, file_type="pdf", use_vision=use_vision)
33
+
34
+ def save_json_output(self, pdf_path, output_path, use_vision=True):
35
+ """
36
+ Process a PDF file and save the structured output as JSON.
37
+
38
+ Args:
39
+ pdf_path: Path to the PDF file
40
+ output_path: Path where to save the JSON output
41
+ use_vision: Whether to use vision model for improved analysis
42
+
43
+ Returns:
44
+ Path to the saved JSON file
45
+ """
46
+ # Process the PDF
47
+ result = self.process_pdf(pdf_path, use_vision=use_vision)
48
+
49
+ # Save the result to JSON
50
+ output_path = Path(output_path)
51
+ output_path.parent.mkdir(parents=True, exist_ok=True)
52
+
53
+ with open(output_path, 'w') as f:
54
+ json.dump(result, f, indent=2)
55
+
56
+ return output_path
57
+
58
+ # For testing directly
59
+ if __name__ == "__main__":
60
+ import sys
61
+
62
+ if len(sys.argv) < 2:
63
+ print("Usage: python pdf_ocr.py <pdf_path> [output_path]")
64
+ sys.exit(1)
65
+
66
+ pdf_path = sys.argv[1]
67
+ output_path = sys.argv[2] if len(sys.argv) > 2 else None
68
+
69
+ processor = PDFOCR()
70
+
71
+ if output_path:
72
+ result_path = processor.save_json_output(pdf_path, output_path)
73
+ print(f"Results saved to: {result_path}")
74
+ else:
75
+ result = processor.process_pdf(pdf_path)
76
+ print(json.dumps(result, indent=2))
backup/requirements.txt ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ streamlit>=1.43.2
2
+ mistralai>=0.0.7
3
+ pydantic>=2.0.0
4
+ pycountry>=23.12.11
5
+ pillow>=10.0.0
6
+ python-multipart>=0.0.6
7
+ pdf2image>=1.17.0
8
+ pytesseract>=0.3.10
9
+ opencv-python-headless>=4.6.0
10
+ numpy>=1.23.5
backup/structured_ocr.py ADDED
@@ -0,0 +1,414 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import sys
3
+ import time
4
+ from enum import Enum
5
+ from pathlib import Path
6
+ import json
7
+ import base64
8
+ import pycountry
9
+ import logging
10
+ from pydantic import BaseModel
11
+ from mistralai import Mistral
12
+ from mistralai import DocumentURLChunk, ImageURLChunk, TextChunk
13
+
14
+ # Configure logging
15
+ logging.basicConfig(level=logging.INFO,
16
+ format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
17
+
18
+ # Import utilities for OCR processing
19
+ try:
20
+ from ocr_utils import replace_images_in_markdown, get_combined_markdown
21
+ except ImportError:
22
+ # Define fallback functions if module not found
23
+ def replace_images_in_markdown(markdown_str, images_dict):
24
+ for img_name, base64_str in images_dict.items():
25
+ markdown_str = markdown_str.replace(
26
+ f"![{img_name}]({img_name})", f"![{img_name}]({base64_str})"
27
+ )
28
+ return markdown_str
29
+
30
+ def get_combined_markdown(ocr_response):
31
+ markdowns = []
32
+ for page in ocr_response.pages:
33
+ image_data = {}
34
+ for img in page.images:
35
+ image_data[img.id] = img.image_base64
36
+ markdowns.append(replace_images_in_markdown(page.markdown, image_data))
37
+ return "\n\n".join(markdowns)
38
+
39
+ # Import config directly (now local to historical-ocr)
40
+ from config import MISTRAL_API_KEY, OCR_MODEL, TEXT_MODEL, VISION_MODEL
41
+
42
# Build an Enum of language names for structured output: one member per
# language that pycountry knows an ISO 639-1 (alpha_2) code for.
languages = {lang.alpha_2: lang.name for lang in pycountry.languages if hasattr(lang, 'alpha_2')}


class LanguageMeta(Enum.__class__):
    """Metaclass that injects one enum member per known language.

    Member names are the upper-cased language names with spaces replaced by
    underscores; member values are the original language names.
    """

    def __new__(metacls, cls, bases, classdict):
        # Populate the enum class dict before Enum's own machinery runs.
        for _code, language_name in languages.items():
            classdict[language_name.upper().replace(' ', '_')] = language_name
        return super().__new__(metacls, cls, bases, classdict)


class Language(Enum, metaclass=LanguageMeta):
    """Enum of language names; all members are generated by LanguageMeta."""
    pass
53
+
54
class StructuredOCRModel(BaseModel):
    """Schema for the structured JSON the chat models are asked to emit."""

    file_name: str             # original file name of the processed document
    topics: list[str]          # high-level topics detected in the document
    languages: list[Language]  # languages detected in the document
    ocr_contents: dict         # organized OCR text, keyed by logical section
59
+
60
class StructuredOCR:
    """OCR pipeline built on the Mistral API.

    Uploads a PDF or image, runs the dedicated OCR model over it, then asks
    a text or vision chat model to reorganize the raw OCR markdown into a
    structured dictionary (topics, languages, logical sections).
    """

    def __init__(self, api_key=None):
        """Initialize the OCR processor.

        Args:
            api_key: Mistral API key; falls back to MISTRAL_API_KEY from config.
        """
        self.api_key = api_key or MISTRAL_API_KEY
        self.client = Mistral(api_key=self.api_key)

    def process_file(self, file_path, file_type=None, use_vision=True, max_pages=None, file_size_mb=None, custom_pages=None):
        """Process a file and return structured OCR results.

        Args:
            file_path: Path to the file to process
            file_type: 'pdf' or 'image' (will be auto-detected if None)
            use_vision: Whether to use vision model for improved analysis
            max_pages: Optional limit on number of pages to process
            file_size_mb: Optional file size in MB (used for automatic page limiting)
            custom_pages: Optional list of specific page numbers to process

        Returns:
            Dictionary with structured OCR results (an error payload on failure).
        """
        # Convert file_path to Path object if it's a string
        file_path = Path(file_path)

        # Auto-detect file type from the extension if not provided
        if file_type is None:
            suffix = file_path.suffix.lower()
            file_type = "pdf" if suffix == ".pdf" else "image"

        # Get file size if not provided
        if file_size_mb is None and file_path.exists():
            file_size_mb = file_path.stat().st_size / (1024 * 1024)  # bytes -> MB

        # Reject files that exceed the Mistral API limit (50 MB)
        if file_size_mb and file_size_mb > 50:
            logging.warning(f"File size {file_size_mb:.2f} MB exceeds Mistral API limit of 50 MB")
            return {
                "file_name": file_path.name,
                "topics": ["Document"],
                "languages": ["English"],
                "confidence_score": 0.0,
                "error": f"File size {file_size_mb:.2f} MB exceeds API limit of 50 MB",
                "ocr_contents": {
                    "error": f"Failed to process file: File size {file_size_mb:.2f} MB exceeds Mistral API limit of 50 MB",
                    "partial_text": "Document could not be processed due to size limitations."
                }
            }

        # For PDF files, limit pages based on file size if no explicit limit is given.
        # NOTE(review): the > 100 and > 50 branches are currently unreachable —
        # anything over 50 MB already returned above. Kept so the sizing policy
        # survives if that hard cap is ever raised.
        if file_type == "pdf" and file_size_mb and max_pages is None and custom_pages is None:
            if file_size_mb > 100:   # Very large files
                max_pages = 3
            elif file_size_mb > 50:  # Large files
                max_pages = 5
            elif file_size_mb > 20:  # Medium files
                max_pages = 10
            else:                    # Small files
                max_pages = None     # Process all pages

        # Start processing timer
        start_time = time.time()

        # Read and process the file
        if file_type == "pdf":
            result = self._process_pdf(file_path, use_vision, max_pages, custom_pages)
        else:
            result = self._process_image(file_path, use_vision)

        # Add processing time information
        result['processing_time'] = time.time() - start_time

        # Add a default confidence score if not present
        if 'confidence_score' not in result:
            result['confidence_score'] = 0.85  # Default confidence

        return result

    def _process_pdf(self, file_path, use_vision=True, max_pages=None, custom_pages=None):
        """Process a PDF file with OCR.

        Args:
            file_path: Path to the PDF file
            use_vision: Whether to use vision model
            max_pages: Optional limit on the number of pages to process
            custom_pages: Optional list of specific 1-based page numbers to process

        Returns:
            Dictionary with structured OCR results (an error payload on failure).
        """
        logger = logging.getLogger("pdf_processor")
        logger.info(f"Processing PDF: {file_path}")

        try:
            # Upload the PDF file
            logger.info("Uploading PDF file to Mistral API")
            uploaded_file = self.client.files.upload(
                file={
                    "file_name": file_path.stem,
                    "content": file_path.read_bytes(),
                },
                purpose="ocr",
            )

            # Get a short-lived signed URL for the uploaded file
            signed_url = self.client.files.get_signed_url(file_id=uploaded_file.id, expiry=1)

            # Process the PDF with OCR
            logger.info(f"Processing PDF with OCR using {OCR_MODEL}")
            pdf_response = self.client.ocr.process(
                document=DocumentURLChunk(document_url=signed_url.url),
                model=OCR_MODEL,
                include_image_base64=True
            )

            # Limit pages if requested
            pages_to_process = pdf_response.pages
            total_pages = len(pdf_response.pages)
            limited_pages = False

            logger.info(f"PDF has {total_pages} total pages")

            # Handle custom page selection if provided
            if custom_pages:
                # Convert to 0-based indexing and keep only in-range page numbers
                valid_indices = [i - 1 for i in custom_pages if 0 < i <= total_pages]
                if valid_indices:
                    pages_to_process = [pdf_response.pages[i] for i in valid_indices]
                    limited_pages = True
                    logger.info(f"Processing {len(valid_indices)} custom-selected pages")
            # Otherwise handle max_pages limit
            elif max_pages and total_pages > max_pages:
                pages_to_process = pages_to_process[:max_pages]
                limited_pages = True
                logger.info(f"Processing only first {max_pages} pages out of {total_pages} total pages")

            # Average per-page confidence when the OCR API provides one.
            confidence_score = 0.0
            try:
                confidence_values = []
                for page in pages_to_process:
                    if hasattr(page, 'confidence'):
                        confidence_values.append(page.confidence)

                if confidence_values:
                    confidence_score = sum(confidence_values) / len(confidence_values)
                else:
                    confidence_score = 0.85  # Default if no confidence scores available
            except Exception:  # was a bare except; never swallow SystemExit/KeyboardInterrupt
                confidence_score = 0.85  # Default fallback

            # Combine pages' markdown into a single string
            all_markdown = "\n\n".join([page.markdown for page in pages_to_process])

            # Extract structured data using the appropriate model
            if use_vision:
                # Get base64 of first page for vision model
                first_page_image = None
                if pages_to_process and pages_to_process[0].images:
                    first_page_image = pages_to_process[0].images[0].image_base64

                if first_page_image:
                    # Use vision model
                    logger.info(f"Using vision model: {VISION_MODEL}")
                    result = self._extract_structured_data_with_vision(first_page_image, all_markdown, file_path.name)
                else:
                    # Fall back to text-only model if no image available
                    logger.info(f"No images in PDF, falling back to text model: {TEXT_MODEL}")
                    result = self._extract_structured_data_text_only(all_markdown, file_path.name)
            else:
                # Use text-only model
                logger.info(f"Using text-only model: {TEXT_MODEL}")
                result = self._extract_structured_data_text_only(all_markdown, file_path.name)

            # Add page limit info to result if needed
            if limited_pages:
                result['limited_pages'] = {
                    'processed': len(pages_to_process),
                    'total': total_pages
                }

            # Add confidence score
            result['confidence_score'] = confidence_score

            # Store the raw OCR response for image rendering.
            # NOTE: this object is not JSON-serializable; strip it before dumping.
            result['raw_response'] = pdf_response

            logger.info(f"PDF processing completed successfully")
            return result

        except Exception as e:
            logger.error(f"Error processing PDF: {str(e)}")
            # Return basic result on error
            return {
                "file_name": file_path.name,
                "topics": ["Document"],
                "languages": ["English"],
                "confidence_score": 0.0,
                "error": str(e),
                "ocr_contents": {
                    "error": f"Failed to process PDF: {str(e)}",
                    "partial_text": "Document could not be fully processed."
                }
            }

    def _process_image(self, file_path, use_vision=True):
        """Process an image file with OCR.

        Args:
            file_path: Path to the image file
            use_vision: Whether to use the vision model for structuring

        Returns:
            Dictionary with structured OCR results (an error payload on failure).
        """
        logger = logging.getLogger("image_processor")
        logger.info(f"Processing image: {file_path}")

        try:
            # Read and encode the image file as a data URL
            logger.info("Encoding image for API")
            encoded_image = base64.b64encode(file_path.read_bytes()).decode()
            # assumes JPEG content — TODO confirm non-JPEG inputs are accepted by the API
            base64_data_url = f"data:image/jpeg;base64,{encoded_image}"

            # Process the image with OCR
            logger.info(f"Processing image with OCR using {OCR_MODEL}")
            image_response = self.client.ocr.process(
                document=ImageURLChunk(image_url=base64_data_url),
                model=OCR_MODEL,
                include_image_base64=True
            )

            # Get the OCR markdown from the first page
            image_ocr_markdown = image_response.pages[0].markdown if image_response.pages else ""

            # Use the API-reported confidence when available; best effort only.
            confidence_score = 0.85  # Default value
            try:
                if hasattr(image_response.pages[0], 'confidence'):
                    confidence_score = image_response.pages[0].confidence
            except Exception:  # was a bare except; e.g. IndexError when there are no pages
                pass

            # Extract structured data using the appropriate model
            if use_vision:
                logger.info(f"Using vision model: {VISION_MODEL}")
                result = self._extract_structured_data_with_vision(base64_data_url, image_ocr_markdown, file_path.name)
            else:
                logger.info(f"Using text-only model: {TEXT_MODEL}")
                result = self._extract_structured_data_text_only(image_ocr_markdown, file_path.name)

            # Add confidence score
            result['confidence_score'] = confidence_score

            # Store the raw OCR response for image rendering.
            # NOTE: this object is not JSON-serializable; strip it before dumping.
            result['raw_response'] = image_response

            logger.info("Image processing completed successfully")
            return result

        except Exception as e:
            logger.error(f"Error processing image: {str(e)}")
            # Return basic result on error
            return {
                "file_name": file_path.name,
                "topics": ["Document"],
                "languages": ["English"],
                "confidence_score": 0.0,
                "error": str(e),
                "ocr_contents": {
                    "error": f"Failed to process image: {str(e)}",
                    "partial_text": "Image could not be processed."
                }
            }

    def _extract_structured_data_with_vision(self, image_base64, ocr_markdown, filename):
        """Extract structured data using the vision model.

        Falls back to the text-only model if the vision call fails.
        """
        try:
            # Parse with vision model
            chat_response = self.client.chat.parse(
                model=VISION_MODEL,
                messages=[
                    {
                        "role": "user",
                        "content": [
                            ImageURLChunk(image_url=image_base64),
                            TextChunk(text=(
                                f"This is a historical document's OCR in markdown:\n"
                                f"<BEGIN_IMAGE_OCR>\n{ocr_markdown}\n<END_IMAGE_OCR>.\n"
                                f"Convert this into a structured JSON response with the OCR contents in a sensible dictionary. "
                                f"Extract topics, languages, and organize the content logically."
                            ))
                        ],
                    },
                ],
                response_format=StructuredOCRModel,
                temperature=0
            )

            # Round-trip through JSON so enum members become plain values.
            result = json.loads(chat_response.choices[0].message.parsed.json())

            # Ensure languages is a list of strings (no-op after the JSON round-trip,
            # kept as a safeguard against enum objects leaking through).
            if 'languages' in result:
                result['languages'] = [str(lang) for lang in result.get('languages', [])]

        except Exception as e:
            # Fall back to text-only model if vision model fails
            # (was print(); route through logging like the rest of the module)
            logging.warning(f"Vision model failed: {str(e)}. Falling back to text-only model.")
            result = self._extract_structured_data_text_only(ocr_markdown, filename)

        return result

    def _extract_structured_data_text_only(self, ocr_markdown, filename):
        """Extract structured data using the text-only model.

        Returns a minimal result containing the raw OCR text if parsing fails.
        """
        try:
            # Parse with text-only model
            chat_response = self.client.chat.parse(
                model=TEXT_MODEL,
                messages=[
                    {
                        "role": "user",
                        "content": f"This is a historical document's OCR in markdown:\n"
                                   f"<BEGIN_IMAGE_OCR>\n{ocr_markdown}\n<END_IMAGE_OCR>.\n"
                                   f"Convert this into a structured JSON response with the OCR contents. "
                                   f"Extract topics, languages, and organize the content logically."
                    },
                ],
                response_format=StructuredOCRModel,
                temperature=0
            )

            # Round-trip through JSON so enum members become plain values.
            result = json.loads(chat_response.choices[0].message.parsed.json())

            # Ensure languages is a list of strings (no-op after the JSON round-trip,
            # kept as a safeguard against enum objects leaking through).
            if 'languages' in result:
                result['languages'] = [str(lang) for lang in result.get('languages', [])]

        except Exception as e:
            # Create a basic result if parsing fails
            # (was print(); route through logging like the rest of the module)
            logging.warning(f"Text model failed: {str(e)}. Creating basic result.")
            result = {
                "file_name": filename,
                "topics": ["Document"],
                "languages": ["English"],
                "ocr_contents": {
                    "raw_text": ocr_markdown
                }
            }

        return result
401
+
402
# For testing directly: python structured_ocr.py <file_path>
if __name__ == "__main__":
    import sys

    if len(sys.argv) < 2:
        print("Usage: python structured_ocr.py <file_path>")
        sys.exit(1)

    file_path = sys.argv[1]
    processor = StructuredOCR()
    result = processor.process_file(file_path)

    # BUG FIX: successful results carry 'raw_response' (an SDK object that is
    # not JSON-serializable), which made json.dumps raise TypeError. Drop it
    # before printing.
    result.pop('raw_response', None)
    print(json.dumps(result, indent=2))
config.py CHANGED
@@ -4,64 +4,14 @@ Configuration file for Mistral OCR processing.
4
  Contains API key and other settings.
5
  """
6
  import os
7
- import logging
8
- from dotenv import load_dotenv
9
 
10
- # Configure logging
11
- logger = logging.getLogger("config")
 
 
 
12
 
13
- # Load environment variables from .env file if it exists
14
- load_dotenv()
15
-
16
- # Mistral API key handling - prioritizing Hugging Face environment
17
- # Priority order:
18
- # 1. HF_API_KEY environment variable (Hugging Face standard)
19
- # 2. HUGGING_FACE_API_KEY environment variable (alternative name)
20
- # 3. HF_MISTRAL_API_KEY environment variable (for Hugging Face deployment)
21
- # 4. MISTRAL_API_KEY environment variable (fallback)
22
- # 5. Empty string (will show warning in app)
23
-
24
- MISTRAL_API_KEY = os.environ.get("HF_API_KEY",
25
- os.environ.get("HUGGING_FACE_API_KEY",
26
- os.environ.get("HF_MISTRAL_API_KEY",
27
- os.environ.get("MISTRAL_API_KEY", "")))).strip()
28
-
29
- if not MISTRAL_API_KEY:
30
- logger.warning("No Mistral API key found in environment variables. API functionality will be limited.")
31
-
32
- # Check if we're in test mode (allows operation without valid API key)
33
- # Set to False to use actual API calls with Mistral API
34
- TEST_MODE = False
35
-
36
- # Model settings with fallbacks
37
- OCR_MODEL = os.environ.get("MISTRAL_OCR_MODEL", "mistral-ocr-latest")
38
- TEXT_MODEL = os.environ.get("MISTRAL_TEXT_MODEL", "mistral-small-latest") # Updated from ministral-8b-latest
39
- VISION_MODEL = os.environ.get("MISTRAL_VISION_MODEL", "mistral-small-latest") # faster model that supports vision
40
-
41
- # Image preprocessing settings optimized for historical documents
42
- # These can be customized from environment variables
43
- IMAGE_PREPROCESSING = {
44
- "enhance_contrast": float(os.environ.get("ENHANCE_CONTRAST", "3.5")), # Increased contrast for better text recognition
45
- "sharpen": os.environ.get("SHARPEN", "True").lower() in ("true", "1", "yes"),
46
- "denoise": os.environ.get("DENOISE", "True").lower() in ("true", "1", "yes"),
47
- "max_size_mb": float(os.environ.get("MAX_IMAGE_SIZE_MB", "200.0")), # Increased size limit for better quality
48
- "target_dpi": int(os.environ.get("TARGET_DPI", "300")), # Target DPI for scaling
49
- "compression_quality": int(os.environ.get("COMPRESSION_QUALITY", "100")), # Higher quality for better OCR results
50
- # # Enhanced settings for handwritten documents
51
- "handwritten": {
52
- "block_size": int(os.environ.get("HANDWRITTEN_BLOCK_SIZE", "21")), # Larger block size for adaptive thresholding
53
- "constant": int(os.environ.get("HANDWRITTEN_CONSTANT", "5")), # Lower constant for adaptive thresholding
54
- "use_dilation": os.environ.get("HANDWRITTEN_DILATION", "True").lower() in ("true", "1", "yes"), # Connect broken strokes
55
- "dilation_iterations": int(os.environ.get("HANDWRITTEN_DILATION_ITERATIONS", "2")), # More iterations for better stroke connection
56
- "dilation_kernel_size": int(os.environ.get("HANDWRITTEN_DILATION_KERNEL_SIZE", "3")) # Larger kernel for dilation
57
- }
58
- }
59
-
60
- # OCR settings optimized for single-page performance
61
- OCR_SETTINGS = {
62
- "timeout_ms": int(os.environ.get("OCR_TIMEOUT_MS", "45000")), # Shorter timeout for single pages (45 seconds)
63
- "max_retries": int(os.environ.get("OCR_MAX_RETRIES", "2")), # Fewer retries to avoid rate-limiting
64
- "retry_delay": int(os.environ.get("OCR_RETRY_DELAY", "1")), # Shorter initial retry delay for faster execution
65
- "include_image_base64": os.environ.get("INCLUDE_IMAGE_BASE64", "True").lower() in ("true", "1", "yes"),
66
- "thread_count": int(os.environ.get("OCR_THREAD_COUNT", "2")) # Lower thread count to prevent API rate limiting
67
- }
 
4
  Contains API key and other settings.
5
  """
6
  import os
 
 
7
 
8
# Your Mistral API key - read from Hugging Face secrets or environment variables.
# Priority order: HF_MISTRAL_API_KEY > MISTRAL_API_KEY > empty string.
# Note: No default API key is provided for security reasons.
# .strip() guards against stray whitespace/newlines pasted into a secret,
# which would otherwise silently break authentication.
MISTRAL_API_KEY = os.environ.get("HF_MISTRAL_API_KEY",            # First check HF-specific env var
                                 os.environ.get("MISTRAL_API_KEY", "")).strip()  # Then regular env var

# Model settings
OCR_MODEL = "mistral-ocr-latest"     # dedicated OCR model
TEXT_MODEL = "ministral-8b-latest"   # text-only structuring model
VISION_MODEL = "pixtral-12b-latest"  # vision-capable structuring model
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
constants.py DELETED
@@ -1,193 +0,0 @@
1
- """
2
- Constants for the Historical OCR application.
3
-
4
- This module contains all the constants used throughout the application,
5
- making it easier to maintain and update values in one place.
6
- """
7
-
8
- # API limits
9
- MAX_FILE_SIZE_MB = 200
10
- MAX_PAGES = 20
11
-
12
- # Caching
13
- CACHE_TTL_SECONDS = 24 * 3600 # 24 hours
14
- MAX_CACHE_ENTRIES = 20
15
-
16
- # Image processing
17
- MAX_IMAGE_DIMENSION = 2500
18
- IMAGE_QUALITY = 100
19
-
20
- # Document types
21
- DOCUMENT_TYPES = [
22
- "Auto-detect (standard processing)",
23
- "Newspaper or Magazine",
24
- "Letter or Correspondence",
25
- "Book or Publication",
26
- "Form or Legal Document",
27
- "Recipe",
28
- "Handwritten Document",
29
- "Map or Illustration",
30
- "Table or Spreadsheet",
31
- "Other (specify in instructions)"
32
- ]
33
-
34
- # Document layouts
35
- DOCUMENT_LAYOUTS = [
36
- "Standard layout",
37
- "Multiple columns",
38
- "Table/grid format",
39
- "Mixed layout with images"
40
- ]
41
-
42
- # Preprocessing document types
43
- PREPROCESSING_DOC_TYPES = ["standard", "handwritten", "typed", "printed"]
44
-
45
- # Rotation options
46
- ROTATION_OPTIONS = [0, 90, 180, 270]
47
-
48
- # PDF settings
49
- DEFAULT_PDF_DPI = 100
50
- MIN_PDF_DPI = 72
51
- MAX_PDF_DPI = 300
52
- DEFAULT_MAX_PAGES = 3
53
-
54
- # Performance modes
55
- PERFORMANCE_MODES = ["Quality", "Speed"]
56
-
57
- # Custom prompt templates
58
- CUSTOM_PROMPT_TEMPLATES = {
59
- "Newspaper or Magazine": "This is a newspaper/magazine. Process columns from top to bottom, capture headlines, bylines, article text and captions.",
60
- "Letter or Correspondence": "This is a letter/correspondence. Capture letterhead, date, greeting, body, closing and signature. Note any handwritten annotations.",
61
- "Book or Publication": "This is a book/publication. Extract titles, headers, footnotes, page numbers and body text. Preserve paragraph structure and any special formatting.",
62
- "Form or Legal Document": "This is a form/legal document. Extract all field labels and values, preserving the structure. Pay special attention to signature lines, dates, and any official markings.",
63
- "Recipe": "This is a recipe. Extract title, ingredients list with measurements, and preparation instructions. Maintain the distinction between ingredients and preparation steps.",
64
- "Handwritten Document": "This is a handwritten document. Carefully transcribe all handwritten text, preserving line breaks. Note any unclear sections or annotations.",
65
- "Map or Illustration": "This is a map or illustration. Transcribe all labels, legends, captions, and annotations. Note any scale indicators or directional markings.",
66
- "Table or Spreadsheet": "This is a table/spreadsheet. Preserve row and column structure, maintaining alignment of data. Extract headers and all cell values.",
67
- "Other (specify in instructions)": "Please describe the document type and any special processing requirements here."
68
- }
69
-
70
- # Layout prompt additions
71
- LAYOUT_PROMPT_ADDITIONS = {
72
- "Multiple columns": "Document has multiple columns. Read each column from top to bottom, then move to the next column.",
73
- "Table/grid format": "Document contains table data. Preserve row and column structure during extraction.",
74
- "Mixed layout with images": "Document has mixed text layout with images. Extract text in proper reading order."
75
- }
76
-
77
- # Content themes for subject tag extraction
78
- CONTENT_THEMES = {
79
- # Historical Periods
80
- "Prehistoric": ["paleolithic", "neolithic", "stone age", "bronze age", "iron age", "prehistoric", "ancient", "archaeology", "artifact", "primitive"],
81
- "Ancient World": ["mesopotamia", "egypt", "greek", "roman", "persia", "babylonian", "assyrian", "pharaoh", "hieroglyphics", "cuneiform", "classical", "antiquity", "hellenistic", "republic", "empire"],
82
- "Medieval": ["middle ages", "medieval", "feudal", "crusades", "byzantine", "carolingian", "holy roman empire", "dark ages", "castle", "knights", "chivalry", "monastery", "plague", "viking", "norse"],
83
- "Renaissance": ["renaissance", "humanism", "reformation", "counter-reformation", "medici", "tudor", "elizabethan", "shakespeare", "machiavelli", "gutenberg", "printing press"],
84
- "Early Modern": ["early modern", "enlightenment", "age of reason", "scientific revolution", "colonial", "colonization", "imperialism", "revolution", "baroque", "bourbon", "habsburg", "stuart"],
85
- "18th Century": ["18th century", "1700s", "revolution", "american revolution", "french revolution", "enlightenment", "rococo", "neoclassical", "voltaire", "rousseau", "industrial"],
86
- "19th Century": ["19th century", "1800s", "victorian", "romantic", "napoleonic", "civil war", "industrial revolution", "manifest destiny", "colonial", "imperialism", "belle epoque", "fin de siecle"],
87
- "20th Century": ["20th century", "1900s", "world war", "great depression", "cold war", "interwar", "postwar", "modernism", "atomic", "post-colonial", "totalitarian", "fascism", "soviet", "civil rights"],
88
- "Contemporary": ["contemporary", "modern", "postmodern", "digital age", "globalization", "information age", "post-industrial", "post-colonial", "post-soviet", "post-war", "21st century"],
89
-
90
- # Geographic Contexts
91
- "European History": ["europe", "western europe", "eastern europe", "central europe", "mediterranean", "nordic", "iberian", "british", "habsburg", "bourbon", "prussia", "holy roman empire"],
92
- "Asian History": ["asia", "east asia", "south asia", "central asia", "southeast asia", "china", "japan", "india", "persia", "ottoman", "mongolian", "dynasty", "shogunate", "mughal", "silk road"],
93
- "African History": ["africa", "north africa", "west africa", "east africa", "sub-saharan", "sahel", "swahili", "maghreb", "nubian", "ethiopian", "zulu", "colonial africa", "apartheid"],
94
- "American History": ["america", "colonial america", "revolutionary", "antebellum", "civil war", "reconstruction", "frontier", "westward expansion", "manifest destiny", "native american", "indigenous"],
95
- "Latin American": ["latin america", "mesoamerica", "caribbean", "aztec", "mayan", "inca", "colonial", "viceroyalty", "independence", "revolution", "hispanic", "creole", "mestizo", "indigenous"],
96
- "Oceanic History": ["oceania", "pacific", "australian", "aboriginal", "indigenous", "polynesian", "melanesian", "micronesian", "maori", "maritime", "exploration", "settlement", "colonial"],
97
-
98
- # Historical Methodologies & Approaches
99
- "Archival Research": ["archive", "manuscript", "primary source", "provenance", "document", "preservation", "cataloging", "repository", "collection", "papers", "fonds", "records", "registry"],
100
- "Oral History": ["oral history", "testimony", "interview", "narrative", "memory", "ethnography", "storytelling", "tradition", "folklore", "witness", "account", "recording", "indigenous knowledge"],
101
- "Historical Archaeology": ["archaeology", "excavation", "artifact", "material culture", "stratigraphy", "conservation", "field work", "site", "ruins", "preservation", "heritage", "restoration"],
102
- "Digital History": ["digital", "database", "digitization", "computational", "network analysis", "gis", "mapping", "visualization", "data mining", "text analysis", "digital humanities", "encoding"],
103
- "Historiography": ["historiography", "revisionism", "interpretation", "narrative", "discourse", "bias", "perspective", "theory", "methodology", "framework", "historical thinking", "meta-history"],
104
-
105
- # Historical Document Types
106
- "Administrative Records": ["record", "registry", "account", "ledger", "census", "tax roll", "inventory", "charter", "deed", "grant", "patent", "minutes", "docket", "survey", "assessment", "register"],
107
- "Diplomatic Documents": ["treaty", "agreement", "proclamation", "declaration", "diplomatic", "embassy", "consul", "dispatch", "communique", "protocol", "convention", "alliance", "international"],
108
- "Personal Papers": ["diary", "journal", "memoir", "autobiography", "correspondence", "letter", "personal", "private", "papers", "notes", "scrapbook", "commonplace book", "sketchbook"],
109
- "Media History": ["newspaper", "gazette", "periodical", "pamphlet", "broadside", "print culture", "press", "editorial", "journalism", "reporter", "editor", "circulation", "readership", "subscriber"],
110
- "Visual Materials": ["photograph", "illustration", "print", "map", "atlas", "cartography", "engraving", "woodcut", "lithograph", "panorama", "portrait", "landscape", "sketch", "drawing", "plate"],
111
- "Legal Documents": ["legal", "law", "statute", "code", "constitution", "legislation", "decree", "ordinance", "bylaw", "regulation", "case", "trial", "testimony", "deposition", "verdict", "judgment"],
112
-
113
- # Historical Themes & Movements
114
- "Economic History": ["economic", "commerce", "trade", "market", "merchant", "finance", "banking", "currency", "coin", "inflation", "recession", "depression", "exchange", "capital", "labor", "guild"],
115
- "Social History": ["social", "society", "class", "status", "hierarchy", "everyday life", "community", "neighborhood", "urban", "rural", "poverty", "wealth", "leisure", "entertainment", "customs"],
116
- "Political History": ["political", "politics", "government", "state", "monarchy", "republic", "democracy", "aristocracy", "parliament", "congress", "election", "regime", "policy", "reform", "revolution"],
117
- "Intellectual History": ["intellectual", "idea", "philosophy", "theory", "concept", "movement", "thought", "discourse", "debate", "enlightenment", "rationalism", "empiricism", "ideology"],
118
- "Cultural History": ["cultural", "culture", "custom", "tradition", "ritual", "ceremony", "festival", "celebration", "holiday", "folklore", "music", "art", "literature", "fashion", "consumption"],
119
- "Religious History": ["religious", "religion", "church", "theology", "belief", "faith", "worship", "ritual", "sacred", "clergy", "monastery", "temple", "mosque", "synagogue", "pilgrimage", "sect"],
120
- "Military History": ["military", "war", "conflict", "battle", "campaign", "siege", "army", "navy", "soldier", "officer", "regiment", "battalion", "artillery", "cavalry", "infantry", "strategy", "tactics"],
121
- "Science History": ["scientific", "science", "experiment", "discovery", "theory", "hypothesis", "observation", "laboratory", "academy", "research", "natural philosophy", "medicine", "technology"],
122
- "Environmental History": ["environmental", "ecology", "climate", "weather", "landscape", "agriculture", "farming", "forestry", "conservation", "pollution", "resource", "sustainability", "natural"],
123
-
124
- # Specialized Historical Topics
125
- "Migration History": ["migration", "immigration", "emigration", "diaspora", "exile", "refugee", "settlement", "colonization", "population movement", "forced migration", "displacement", "resettlement"],
126
- "Maritime History": ["maritime", "naval", "shipping", "navigation", "sailor", "piracy", "privateering", "admiralty", "port", "harbor", "shipyard", "vessel", "sail", "trade route", "exploration"],
127
- "Gender History": ["gender", "women", "feminist", "sexuality", "masculinity", "femininity", "patriarchy", "suffrage", "domestic", "family", "marriage", "emancipation", "rights", "equality"],
128
- "Labor History": ["labor", "worker", "union", "strike", "apprentice", "guild", "factory", "workshop", "wage", "hours", "working conditions", "industrialization", "mechanization", "automation"],
129
- "Urban History": ["urban", "city", "town", "metropolitan", "municipal", "civic", "suburb", "neighborhood", "planning", "infrastructure", "utilities", "housing", "development", "gentrification"],
130
- "Rural History": ["rural", "countryside", "village", "agricultural", "farming", "peasant", "yeoman", "tenant", "sharecropper", "enclosure", "common land", "manor", "estate", "plantation"],
131
- "Colonial History": ["colonial", "colony", "settlement", "frontier", "borderland", "territory", "dominion", "province", "governance", "administration", "native", "indigenous", "contact zone"],
132
- "Indigenous History": ["indigenous", "native", "aboriginal", "first nations", "tribal", "reservation", "sovereignty", "land rights", "treaty rights", "cultural preservation", "oral tradition"],
133
-
134
- # General Historical Terms
135
- "Historical": ["history", "historical", "historiography", "heritage", "legacy", "tradition", "memory", "commemoration", "preservation", "conservation", "restoration", "interpretation", "significance"],
136
- "Chronology": ["chronology", "timeline", "periodization", "era", "epoch", "age", "century", "decade", "millennium", "year", "date", "dating", "chronological", "contemporary", "synchronic", "diachronic"],
137
- "Heritage": ["heritage", "preservation", "conservation", "landmark", "monument", "historic site", "museum", "archive", "collection", "artifact", "relic", "antiquity", "cultural heritage", "patrimony"]
138
- }
139
-
140
- # Period tags based on year ranges
141
- # These ranges are used to assign historical period tags to documents based on their year.
142
- PERIOD_TAGS = {
143
- (0, 499): "Ancient Era (to 500 CE)",
144
- (500, 999): "Early Medieval (500–1000)",
145
- (1000, 1299): "High Medieval (1000–1300)",
146
- (1300, 1499): "Late Medieval (1300–1500)",
147
- (1500, 1599): "Renaissance (1500–1600)",
148
- (1600, 1699): "Early Modern (1600–1700)",
149
- (1700, 1775): "Enlightenment (1700–1775)",
150
- (1776, 1799): "Age of Revolutions (1776–1800)",
151
- (1800, 1849): "Early 19th Century (1800–1850)",
152
- (1850, 1899): "Late 19th Century (1850–1900)",
153
- (1900, 1918): "Early 20th Century & WWI (1900–1918)",
154
- (1919, 1938): "Interwar Period (1919–1938)",
155
- (1939, 1945): "World War II (1939–1945)",
156
- (1946, 1968): "Postwar & Mid-20th Century (1946–1968)",
157
- (1969, 1989): "Late 20th Century (1969–1989)",
158
- (1990, 2000): "Turn of the 21st Century (1990–2000)",
159
- (2001, 2099): "Contemporary (21st Century)"
160
- }
161
-
162
- # Default fallback tags for documents when no specific tags are detected.
163
- DEFAULT_TAGS = [
164
- "Document",
165
- "Historical",
166
- "Text",
167
- "Primary Source",
168
- "Archival Material",
169
- "Record",
170
- "Manuscript",
171
- "Printed Material",
172
- "Correspondence",
173
- "Publication"
174
- ]
175
-
176
- # Generic tags that can be used for broad categorization or as supplemental tags.
177
- GENERIC_TAGS = [
178
- "Archive",
179
- "Content",
180
- "Record",
181
- "Source",
182
- "Material",
183
- "Page",
184
- "Scan",
185
- "Image",
186
- "Transcription",
187
- "Uncategorized",
188
- "General",
189
- "Miscellaneous"
190
- ]
191
-
192
- # UI constants
193
- PROGRESS_DELAY = 0.8 # Seconds to show completion message
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
error_handler.py DELETED
@@ -1,65 +0,0 @@
1
- import logging
2
- import streamlit as st
3
- import time
4
- from constants import MAX_FILE_SIZE_MB
5
-
6
- # Configure logging
7
- logger = logging.getLogger("error_handler")
8
- logger.setLevel(logging.INFO)
9
-
10
- def handle_ocr_error(exception, progress_reporter=None):
11
- """
12
- Handle OCR processing errors and provide user-friendly messages
13
-
14
- Args:
15
- exception: The exception that occurred
16
- progress_reporter: ProgressReporter instance for UI updates
17
-
18
- Returns:
19
- str: User-friendly error message
20
- """
21
- error_message = str(exception)
22
-
23
- # Complete progress reporting if provided
24
- if progress_reporter:
25
- progress_reporter.complete(success=False)
26
-
27
- # Check for specific error types and provide helpful user-facing messages
28
- if "rate limit" in error_message.lower() or "429" in error_message or "requests rate limit exceeded" in error_message.lower():
29
- friendly_message = "The AI service is currently experiencing high demand. Please try again in a few minutes."
30
- logger.error(f"Rate limit error: {error_message}")
31
- return friendly_message
32
- elif "quota" in error_message.lower() or "credit" in error_message.lower() or "subscription" in error_message.lower():
33
- friendly_message = "The API usage quota has been reached. Please check your API key and subscription limits."
34
- logger.error(f"API quota error: {error_message}")
35
- return friendly_message
36
- elif "timeout" in error_message.lower() or "timed out" in error_message.lower():
37
- friendly_message = "The request timed out. This may be due to a large document or high server load. Please try again or use a smaller document."
38
- logger.error(f"Timeout error: {error_message}")
39
- return friendly_message
40
- elif "file size" in error_message.lower() or "too large" in error_message.lower():
41
- friendly_message = f"The file is too large. Maximum file size is {MAX_FILE_SIZE_MB}MB."
42
- logger.error(f"File size error: {error_message}")
43
- return friendly_message
44
- else:
45
- # Generic error message for other errors
46
- logger.error(f"OCR processing error: {error_message}", exc_info=True)
47
- return f"An error occurred during processing: {error_message}"
48
-
49
- def check_file_size(file_bytes):
50
- """
51
- Check if file size is within limits
52
-
53
- Args:
54
- file_bytes: File content as bytes
55
-
56
- Returns:
57
- tuple: (is_valid, file_size_mb, error_message)
58
- """
59
- file_size_mb = len(file_bytes) / (1024 * 1024)
60
-
61
- if file_size_mb > MAX_FILE_SIZE_MB:
62
- error_message = f"File size {file_size_mb:.2f} MB exceeds limit of {MAX_FILE_SIZE_MB} MB"
63
- return False, file_size_mb, error_message
64
-
65
- return True, file_size_mb, None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
image_segmentation.py DELETED
@@ -1,253 +0,0 @@
1
- """
2
- Image segmentation utility for OCR preprocessing.
3
- Separates text regions from image regions to improve OCR accuracy on mixed-content documents.
4
- Uses content-aware adaptive segmentation for improved results across document types.
5
- """
6
-
7
- import cv2
8
- import numpy as np
9
- from PIL import Image
10
- import io
11
- import base64
12
- import logging
13
- from pathlib import Path
14
- from typing import Tuple, List, Dict, Union, Optional
15
-
16
- # Configure logging
17
- logging.basicConfig(level=logging.INFO,
18
- format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
19
- logger = logging.getLogger(__name__)
20
-
21
- def segment_image_for_ocr(image_path: Union[str, Path], vision_enabled: bool = True, preserve_content: bool = True) -> Dict[str, Union[Image.Image, str]]:
22
- """
23
- Prepare image for OCR processing using content-aware segmentation.
24
- Uses adaptive region detection based on text density analysis.
25
-
26
- Args:
27
- image_path: Path to the image file
28
- vision_enabled: Whether the vision model is enabled
29
- preserve_content: Whether to preserve original content without enhancement
30
-
31
- Returns:
32
- Dict containing segmentation results
33
- """
34
- # Convert to Path object if string
35
- image_file = Path(image_path) if isinstance(image_path, str) else image_path
36
-
37
- # Log start of processing
38
- logger.info(f"Preparing image for Mistral OCR: {image_file.name}")
39
-
40
- try:
41
- # Open original image with PIL
42
- with Image.open(image_file) as pil_img:
43
- # Check for low entropy images when vision is disabled
44
- if not vision_enabled:
45
- from utils.image_utils import calculate_image_entropy
46
- ent = calculate_image_entropy(pil_img)
47
- if ent < 3.5: # Likely line-art or blank page
48
- logger.info(f"Low entropy image detected ({ent:.2f}), classifying as illustration")
49
- return {
50
- 'text_regions': None,
51
- 'image_regions': pil_img,
52
- 'text_mask_base64': None,
53
- 'combined_result': None,
54
- 'text_regions_coordinates': []
55
- }
56
-
57
- # Convert to RGB if needed
58
- if pil_img.mode != 'RGB':
59
- pil_img = pil_img.convert('RGB')
60
-
61
- # Get image dimensions
62
- img_np = np.array(pil_img)
63
- img_width, img_height = pil_img.size
64
-
65
- # Analyze text density to determine if advanced segmentation is needed
66
- # This replaces document-specific logic with content-aware analysis
67
- from utils.image_utils import estimate_text_density
68
- text_density = estimate_text_density(img_np)
69
-
70
- # Use adaptive approach for documents with unusual text distribution
71
- if text_density['pattern'] == 'varied' or text_density['uppercase_sections'] > 0:
72
- logger.info(f"Using adaptive segmentation for document with varied text density pattern={text_density['pattern']}, uppercase_sections={text_density['uppercase_sections']}")
73
-
74
- # Detect content regions based on text density
75
- from utils.text_utils import detect_content_regions
76
- regions = detect_content_regions(img_np)
77
-
78
- # Create visualization with green borders around the text regions
79
- vis_img = img_np.copy()
80
-
81
- # Draw regions on visualization
82
- for x, y, w, h in regions:
83
- cv2.rectangle(vis_img, (x, y), (x+w, y+h), (0, 255, 0), 3)
84
-
85
- # Add text to indicate we're using adaptive processing
86
- font = cv2.FONT_HERSHEY_SIMPLEX
87
- cv2.putText(vis_img, "Adaptive region processing", (30, 60), font, 1, (0, 255, 0), 2)
88
-
89
- # Create visualization images
90
- text_regions_vis = Image.fromarray(vis_img)
91
- image_regions_vis = text_regions_vis.copy()
92
-
93
- # Create a mask highlighting the text regions
94
- text_mask = np.zeros((img_height, img_width), dtype=np.uint8)
95
- for x, y, w, h in regions:
96
- text_mask[y:y+h, x:x+w] = 255
97
-
98
- _, buffer = cv2.imencode('.png', text_mask)
99
- text_mask_base64 = base64.b64encode(buffer).decode('utf-8')
100
-
101
- # Extract region images
102
- region_images = []
103
- for i, (x, y, w, h) in enumerate(regions):
104
- region = img_np[y:y+h, x:x+w].copy()
105
- region_pil = Image.fromarray(region)
106
-
107
- region_info = {
108
- 'image': region,
109
- 'pil_image': region_pil,
110
- 'coordinates': (x, y, w, h),
111
- 'padded_coordinates': (x, y, w, h),
112
- 'order': i
113
- }
114
- region_images.append(region_info)
115
-
116
- # Return the adaptive segmentation results
117
- return {
118
- 'text_regions': text_regions_vis,
119
- 'image_regions': image_regions_vis,
120
- 'text_mask_base64': f"data:image/png;base64,{text_mask_base64}",
121
- 'combined_result': pil_img,
122
- 'text_regions_coordinates': regions,
123
- 'region_images': region_images,
124
- 'segmentation_type': 'adaptive'
125
- }
126
- else:
127
- # SIMPLIFIED APPROACH for most documents
128
- # Let Mistral OCR handle the entire document understanding process
129
- logger.info(f"Using standard approach for document with uniform text density")
130
-
131
- # For visualization, mark the entire image as a text region
132
- full_image_region = [(0, 0, img_width, img_height)]
133
-
134
- # Create visualization with a simple border
135
- vis_img = img_np.copy()
136
- cv2.rectangle(vis_img, (5, 5), (img_width-5, img_height-5), (0, 255, 0), 5)
137
-
138
- # Add text to indicate this is using Mistral's native processing
139
- font = cv2.FONT_HERSHEY_SIMPLEX
140
- cv2.putText(vis_img, "Processed by Mistral OCR", (30, 60), font, 1, (0, 255, 0), 2)
141
-
142
- # Create visualizations and masks
143
- text_regions_vis = Image.fromarray(vis_img)
144
- image_regions_vis = text_regions_vis.copy()
145
-
146
- # Create a mask of the entire image (just for visualization)
147
- text_mask = np.ones((img_height, img_width), dtype=np.uint8) * 255
148
- _, buffer = cv2.imencode('.png', text_mask)
149
- text_mask_base64 = base64.b64encode(buffer).decode('utf-8')
150
-
151
- # Return the original image as the combined result
152
- return {
153
- 'text_regions': text_regions_vis,
154
- 'image_regions': image_regions_vis,
155
- 'text_mask_base64': f"data:image/png;base64,{text_mask_base64}",
156
- 'combined_result': pil_img,
157
- 'text_regions_coordinates': full_image_region,
158
- 'region_images': [{
159
- 'image': img_np,
160
- 'pil_image': pil_img,
161
- 'coordinates': (0, 0, img_width, img_height),
162
- 'padded_coordinates': (0, 0, img_width, img_height),
163
- 'order': 0
164
- }],
165
- 'segmentation_type': 'simplified'
166
- }
167
-
168
- except Exception as e:
169
- logger.error(f"Error segmenting image {image_file.name}: {str(e)}")
170
- # Return None values if processing fails
171
- return {
172
- 'text_regions': None,
173
- 'image_regions': None,
174
- 'text_mask_base64': None,
175
- 'combined_result': None,
176
- 'text_regions_coordinates': []
177
- }
178
-
179
- def process_segmented_image(image_path: Union[str, Path], output_dir: Optional[Path] = None, preserve_content: bool = True) -> Dict:
180
- """
181
- Process an image using segmentation for improved OCR, saving visualization outputs.
182
-
183
- Args:
184
- image_path: Path to the image file
185
- output_dir: Optional directory to save visualization outputs
186
-
187
- Returns:
188
- Dictionary with processing results and paths to output files
189
- """
190
- # Convert to Path object if string
191
- image_file = Path(image_path) if isinstance(image_path, str) else image_path
192
-
193
- # Create output directory if not provided
194
- if output_dir is None:
195
- output_dir = Path("output") / "segmentation"
196
- output_dir.mkdir(parents=True, exist_ok=True)
197
-
198
- # Process the image with segmentation
199
- segmentation_results = segment_image_for_ocr(image_file)
200
-
201
- # Prepare results dictionary
202
- results = {
203
- 'original_image': str(image_file),
204
- 'output_files': {}
205
- }
206
-
207
- # Save visualization outputs if segmentation was successful
208
- if segmentation_results['text_regions'] is not None:
209
- # Save text regions visualization
210
- text_regions_path = output_dir / f"{image_file.stem}_text_regions.jpg"
211
- segmentation_results['text_regions'].save(text_regions_path)
212
- results['output_files']['text_regions'] = str(text_regions_path)
213
-
214
- # Save image regions visualization
215
- image_regions_path = output_dir / f"{image_file.stem}_image_regions.jpg"
216
- segmentation_results['image_regions'].save(image_regions_path)
217
- results['output_files']['image_regions'] = str(image_regions_path)
218
-
219
- # Save combined result
220
- combined_path = output_dir / f"{image_file.stem}_combined.jpg"
221
- segmentation_results['combined_result'].save(combined_path)
222
- results['output_files']['combined_result'] = str(combined_path)
223
-
224
- # Save text mask visualization
225
- text_mask_path = output_dir / f"{image_file.stem}_text_mask.png"
226
- # Save text mask from base64
227
- if segmentation_results['text_mask_base64']:
228
- base64_data = segmentation_results['text_mask_base64'].split(',')[1]
229
- with open(text_mask_path, 'wb') as f:
230
- f.write(base64.b64decode(base64_data))
231
- results['output_files']['text_mask'] = str(text_mask_path)
232
-
233
- # Add detected text regions count
234
- results['text_regions_count'] = len(segmentation_results['text_regions_coordinates'])
235
- results['text_regions_coordinates'] = segmentation_results['text_regions_coordinates']
236
-
237
- return results
238
-
239
- if __name__ == "__main__":
240
- # Simple test - process a sample image if run directly
241
- import sys
242
-
243
- if len(sys.argv) > 1:
244
- image_path = sys.argv[1]
245
- else:
246
- image_path = "input/handwritten-journal.jpg" # Example image path"
247
-
248
- logger.info(f"Testing image segmentation on {image_path}")
249
- results = process_segmented_image(image_path)
250
-
251
- # Print results summary
252
- logger.info(f"Segmentation complete. Found {results.get('text_regions_count', 0)} text regions.")
253
- logger.info(f"Output files saved to: {[path for path in results.get('output_files', {}).values()]}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
input/The Magician, or Bottle Cungerer.jpeg ADDED

Git LFS Details

  • SHA256: 3becaf6f5548a794436864885bb125f3fa09f1e6f7bdd76e8878f2d36ff26232
  • Pointer size: 132 Bytes
  • Size of remote file: 2.96 MB
input/baldwin-letter-1.jpg ADDED

Git LFS Details

  • SHA256: a30d7d9f224c777a1697507200a87e41be5fd590efbe8271fa41dbd8bd8a158d
  • Pointer size: 131 Bytes
  • Size of remote file: 135 kB
input/baldwin-letter-2.jpg ADDED

Git LFS Details

  • SHA256: 8b605a6eabd466da265e9e1aa3576160c4dbee06643ece5a18cdb1e45f3f683a
  • Pointer size: 131 Bytes
  • Size of remote file: 114 kB
input/flier.png ADDED
input/harpers.pdf DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:3c9030714b07bb5f7c9adf8b175975baa9b4f40402da62d69cad9b0d4ba61b94
3
- size 14931299
 
 
 
 
input/letter-1.jpg ADDED

Git LFS Details

  • SHA256: a30d7d9f224c777a1697507200a87e41be5fd590efbe8271fa41dbd8bd8a158d
  • Pointer size: 131 Bytes
  • Size of remote file: 135 kB
input/letter-2.jpg ADDED

Git LFS Details

  • SHA256: 8b605a6eabd466da265e9e1aa3576160c4dbee06643ece5a18cdb1e45f3f683a
  • Pointer size: 131 Bytes
  • Size of remote file: 114 kB
input/letter-3.jpg ADDED

Git LFS Details

  • SHA256: 7fe2d81bb4e8bef7cdbf87c58a8cc180c49c313e5099de167ae37bbbfb895e88
  • Pointer size: 131 Bytes
  • Size of remote file: 231 kB
input/magician-satire.jpg ADDED

Git LFS Details

  • SHA256: 3becaf6f5548a794436864885bb125f3fa09f1e6f7bdd76e8878f2d36ff26232
  • Pointer size: 132 Bytes
  • Size of remote file: 2.96 MB
input/menu.pdf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:42d96008f374f5be8046b569c868e33f4e5a0e5e166c245d324b44140c7e6c2e
3
+ size 2554815
input/milgram-flier.png CHANGED

Git LFS Details

  • SHA256: 0e1ca2821304427dcf7e2c9e0a03de880f44146bf8fa6abc9a437249fda85486
  • Pointer size: 130 Bytes
  • Size of remote file: 88.5 kB
input/okeefe-menu.pdf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:42d96008f374f5be8046b569c868e33f4e5a0e5e166c245d324b44140c7e6c2e
3
+ size 2554815
input/okeefe-recipe.jpg ADDED
input/recipe.jpg CHANGED

Git LFS Details

  • SHA256: 8bdb2a05dee10e4e181d8636714915f3055c664297e512f805fea180446624b2
  • Pointer size: 130 Bytes
  • Size of remote file: 70.8 kB