Spaces:
Running
Running
submit pull for merge
#1
by
milwright
- opened
This view is limited to 50 files because it contains too many changes.
See the raw diff here.
- .gitattributes +56 -4
- .gitignore +0 -44
- README.md +127 -35
- __pycache__/config.cpython-312.pyc +0 -0
- __pycache__/constants.cpython-312.pyc +0 -0
- __pycache__/error_handler.cpython-312.pyc +0 -0
- __pycache__/image_segmentation.cpython-312.pyc +0 -0
- __pycache__/language_detection.cpython-312.pyc +0 -0
- __pycache__/ocr_processing.cpython-312.pyc +0 -0
- __pycache__/ocr_utils.cpython-312.pyc +0 -0
- __pycache__/preprocessing.cpython-312.pyc +0 -0
- __pycache__/process_file.cpython-312.pyc +0 -0
- __pycache__/structured_ocr.cpython-312.pyc +0 -0
- __pycache__/ui_components.cpython-312.pyc +0 -0
- __pycache__/utils.cpython-312.pyc +0 -0
- app.py +551 -554
- backup/app.py +535 -0
- backup/config.py +17 -0
- input/magician-or-bottle-cungerer.jpg → backup/input/The Magician, or Bottle Cungerer.jpeg +0 -0
- input/baldwin-15th-north.jpg → backup/input/baldwin-letter-1.jpg +0 -0
- input/americae-retectio.jpg → backup/input/baldwin-letter-2.jpg +2 -2
- backup/input/flier.png +0 -0
- input/baldwin-letter.jpg → backup/input/letter-1.jpg +2 -2
- input/gender.jpg → backup/input/letter-2.jpg +2 -2
- input/photo-baldwin-letter.jpg → backup/input/letter-3.jpg +2 -2
- backup/input/magellan-travels.jpg +3 -0
- input/handwritten-journal.jpg → backup/input/menu.pdf +2 -2
- backup/input/recipe.jpg +0 -0
- backup/ocr_utils.py +136 -0
- backup/pdf_ocr.py +76 -0
- backup/requirements.txt +10 -0
- backup/structured_ocr.py +414 -0
- config.py +9 -59
- constants.py +0 -193
- error_handler.py +0 -65
- image_segmentation.py +0 -253
- input/The Magician, or Bottle Cungerer.jpeg +3 -0
- input/baldwin-letter-1.jpg +3 -0
- input/baldwin-letter-2.jpg +3 -0
- input/flier.png +0 -0
- input/harpers.pdf +0 -3
- input/letter-1.jpg +3 -0
- input/letter-2.jpg +3 -0
- input/letter-3.jpg +3 -0
- input/magician-satire.jpg +3 -0
- input/menu.pdf +3 -0
- input/milgram-flier.png +0 -0
- input/okeefe-menu.pdf +3 -0
- input/okeefe-recipe.jpg +0 -0
- input/recipe.jpg +0 -0
.gitattributes
CHANGED
|
@@ -1,4 +1,56 @@
|
|
| 1 |
-
*.
|
| 2 |
-
*.
|
| 3 |
-
*.
|
| 4 |
-
*.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
*.7z filter=lfs diff=lfs merge=lfs -text
|
| 2 |
+
*.arrow filter=lfs diff=lfs merge=lfs -text
|
| 3 |
+
*.bin filter=lfs diff=lfs merge=lfs -text
|
| 4 |
+
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
| 5 |
+
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
| 6 |
+
*.ftz filter=lfs diff=lfs merge=lfs -text
|
| 7 |
+
*.gz filter=lfs diff=lfs merge=lfs -text
|
| 8 |
+
*.h5 filter=lfs diff=lfs merge=lfs -text
|
| 9 |
+
*.joblib filter=lfs diff=lfs merge=lfs -text
|
| 10 |
+
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
| 11 |
+
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
| 12 |
+
*.model filter=lfs diff=lfs merge=lfs -text
|
| 13 |
+
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
| 14 |
+
*.npy filter=lfs diff=lfs merge=lfs -text
|
| 15 |
+
*.npz filter=lfs diff=lfs merge=lfs -text
|
| 16 |
+
*.onnx filter=lfs diff=lfs merge=lfs -text
|
| 17 |
+
*.ot filter=lfs diff=lfs merge=lfs -text
|
| 18 |
+
*.parquet filter=lfs diff=lfs merge=lfs -text
|
| 19 |
+
*.pb filter=lfs diff=lfs merge=lfs -text
|
| 20 |
+
*.pickle filter=lfs diff=lfs merge=lfs -text
|
| 21 |
+
*.pkl filter=lfs diff=lfs merge=lfs -text
|
| 22 |
+
*.pt filter=lfs diff=lfs merge=lfs -text
|
| 23 |
+
*.pth filter=lfs diff=lfs merge=lfs -text
|
| 24 |
+
*.rar filter=lfs diff=lfs merge=lfs -text
|
| 25 |
+
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
| 26 |
+
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
| 27 |
+
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
| 28 |
+
*.tar filter=lfs diff=lfs merge=lfs -text
|
| 29 |
+
*.tflite filter=lfs diff=lfs merge=lfs -text
|
| 30 |
+
*.tgz filter=lfs diff=lfs merge=lfs -text
|
| 31 |
+
*.wasm filter=lfs diff=lfs merge=lfs -text
|
| 32 |
+
*.xz filter=lfs diff=lfs merge=lfs -text
|
| 33 |
+
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
+
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
+
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 36 |
+
input/baldwin-letter-1.jpg filter=lfs diff=lfs merge=lfs -text
|
| 37 |
+
input/baldwin-letter-2.jpg filter=lfs diff=lfs merge=lfs -text
|
| 38 |
+
input/magellan-travels.jpg filter=lfs diff=lfs merge=lfs -text
|
| 39 |
+
input/okeefe-menu.pdf filter=lfs diff=lfs merge=lfs -text
|
| 40 |
+
backup/input/baldwin-letter-1.jpg filter=lfs diff=lfs merge=lfs -text
|
| 41 |
+
backup/input/baldwin-letter-2.jpg filter=lfs diff=lfs merge=lfs -text
|
| 42 |
+
backup/input/letter-1.jpg filter=lfs diff=lfs merge=lfs -text
|
| 43 |
+
backup/input/letter-2.jpg filter=lfs diff=lfs merge=lfs -text
|
| 44 |
+
backup/input/letter-3.jpg filter=lfs diff=lfs merge=lfs -text
|
| 45 |
+
backup/input/magellan-travels.jpg filter=lfs diff=lfs merge=lfs -text
|
| 46 |
+
backup/input/menu.pdf filter=lfs diff=lfs merge=lfs -text
|
| 47 |
+
backup/input/The[[:space:]]Magician,[[:space:]]or[[:space:]]Bottle[[:space:]]Cungerer.jpeg filter=lfs diff=lfs merge=lfs -text
|
| 48 |
+
input/a-la-carte.pdf filter=lfs diff=lfs merge=lfs -text
|
| 49 |
+
input/handwritten-letter.jpg filter=lfs diff=lfs merge=lfs -text
|
| 50 |
+
input/letter-1.jpg filter=lfs diff=lfs merge=lfs -text
|
| 51 |
+
input/letter-2.jpg filter=lfs diff=lfs merge=lfs -text
|
| 52 |
+
input/letter-3.jpg filter=lfs diff=lfs merge=lfs -text
|
| 53 |
+
input/magician-satire.jpg filter=lfs diff=lfs merge=lfs -text
|
| 54 |
+
input/menu.pdf filter=lfs diff=lfs merge=lfs -text
|
| 55 |
+
input/The[[:space:]]Magician,[[:space:]]or[[:space:]]Bottle[[:space:]]Cungerer.jpeg filter=lfs diff=lfs merge=lfs -text
|
| 56 |
+
output/ymca-letter.jpg filter=lfs diff=lfs merge=lfs -text
|
.gitignore
DELETED
|
@@ -1,44 +0,0 @@
|
|
| 1 |
-
# Python bytecode
|
| 2 |
-
__pycache__/
|
| 3 |
-
*.py[cod]
|
| 4 |
-
*.class
|
| 5 |
-
|
| 6 |
-
# MacOS system files
|
| 7 |
-
.DS_Store
|
| 8 |
-
|
| 9 |
-
# Output and temporary files
|
| 10 |
-
output/debug/
|
| 11 |
-
output/comparison/
|
| 12 |
-
output/segmentation_test/text_regions/
|
| 13 |
-
output/preprocessing_test/
|
| 14 |
-
output/batch_test/
|
| 15 |
-
output/commonplace_improved/
|
| 16 |
-
output/commonplace_test/
|
| 17 |
-
output/preview/
|
| 18 |
-
logs/
|
| 19 |
-
*.backup
|
| 20 |
-
*.json
|
| 21 |
-
*.jpg
|
| 22 |
-
*.png
|
| 23 |
-
*.txt
|
| 24 |
-
*.csv
|
| 25 |
-
*.log
|
| 26 |
-
*.zip
|
| 27 |
-
*.tar
|
| 28 |
-
|
| 29 |
-
# Test files
|
| 30 |
-
test_*.py
|
| 31 |
-
test_*.sh
|
| 32 |
-
bug_fix_report.md
|
| 33 |
-
|
| 34 |
-
# Input samples (large binary files)
|
| 35 |
-
input/*.jpeg
|
| 36 |
-
input/*.jpg
|
| 37 |
-
input/*.png
|
| 38 |
-
input/*.pdf
|
| 39 |
-
|
| 40 |
-
# Temporary documents
|
| 41 |
-
Tmplf6xnkgr*
|
| 42 |
-
.env
|
| 43 |
-
output/pipeline_test/americae-retectio/americae-retectio_comparison.jpg
|
| 44 |
-
docs/environment_variables.md
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
README.md
CHANGED
|
@@ -1,54 +1,146 @@
|
|
| 1 |
---
|
| 2 |
title: Historical OCR
|
| 3 |
-
emoji:
|
| 4 |
-
colorFrom:
|
| 5 |
-
colorTo:
|
| 6 |
sdk: streamlit
|
| 7 |
-
sdk_version: 1.
|
| 8 |
app_file: app.py
|
| 9 |
pinned: false
|
| 10 |
-
license:
|
| 11 |
-
short_description:
|
| 12 |
---
|
| 13 |
|
| 14 |
-
# Historical OCR
|
| 15 |
|
| 16 |
-
|
| 17 |
-
|
| 18 |
-
> **Note:** This tool is designed to assist scholars in historical research by extracting text from challenging documents. While it may not achieve 100% accuracy for all materials, it serves as a valuable research aid for navigating historical documents, particularly historical newspapers, handwritten documents, and photos of archival materials.
|
| 19 |
|
| 20 |
## Features
|
| 21 |
|
| 22 |
-
-
|
| 23 |
-
-
|
| 24 |
-
-
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
-
|
| 30 |
-
-
|
| 31 |
|
| 32 |
-
##
|
| 33 |
|
| 34 |
-
|
| 35 |
-
2. Add optional context or special instructions
|
| 36 |
-
3. Get detailed, structured OCR results with historical context
|
| 37 |
|
| 38 |
-
|
|
|
|
| 39 |
|
| 40 |
-
|
| 41 |
-
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 47 |
|
| 48 |
-
##
|
| 49 |
|
| 50 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 51 |
|
| 52 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 53 |
|
| 54 |
-
|
|
|
|
| 1 |
---
|
| 2 |
title: Historical OCR
|
| 3 |
+
emoji: 📜
|
| 4 |
+
colorFrom: red
|
| 5 |
+
colorTo: green
|
| 6 |
sdk: streamlit
|
| 7 |
+
sdk_version: 1.43.2
|
| 8 |
app_file: app.py
|
| 9 |
pinned: false
|
| 10 |
+
license: mit
|
| 11 |
+
short_description: Employs Mistral OCR for transcribing historical data
|
| 12 |
---
|
| 13 |
|
| 14 |
+
# Historical Document OCR
|
| 15 |
|
| 16 |
+
This application uses Mistral AI's OCR capabilities to transcribe and extract information from historical documents.
|
|
|
|
|
|
|
| 17 |
|
| 18 |
## Features
|
| 19 |
|
| 20 |
+
- OCR processing for both image and PDF files
|
| 21 |
+
- Automatic file type detection
|
| 22 |
+
- Structured output generation using Mistral models
|
| 23 |
+
- Interactive web interface with Streamlit
|
| 24 |
+
- Supports historical documents and manuscripts
|
| 25 |
+
- PDF preview functionality for better user experience
|
| 26 |
+
- Smart handling of large PDFs with automatic page limiting
|
| 27 |
+
- Robust error handling with helpful messages
|
| 28 |
+
- Image preprocessing options for enhanced OCR accuracy
|
| 29 |
|
| 30 |
+
## Project Structure
|
| 31 |
|
| 32 |
+
The project is organized as follows:
|
|
|
|
|
|
|
| 33 |
|
| 34 |
+
```
|
| 35 |
+
Historical OCR - Project Structure
|
| 36 |
|
| 37 |
+
┌─ Main Applications
|
| 38 |
+
│ ├─ app.py # Standard Streamlit interface for OCR processing
|
| 39 |
+
│ └─ streamlit_app.py # Educational modular version with learning components
|
| 40 |
+
│
|
| 41 |
+
├─ Core Functionality
|
| 42 |
+
│ ├─ structured_ocr.py # Main OCR processing engine with Mistral AI integration
|
| 43 |
+
│ ├─ ocr_utils.py # Utility functions for OCR text and image processing
|
| 44 |
+
│ ├─ pdf_ocr.py # PDF-specific document processing functionality
|
| 45 |
+
│ └─ config.py # Configuration settings and API keys
|
| 46 |
+
│
|
| 47 |
+
├─ Testing & Development
|
| 48 |
+
│ ├─ simple_test.py # Basic OCR functionality test
|
| 49 |
+
│ ├─ test_pdf.py # PDF processing test
|
| 50 |
+
│ ├─ test_pdf_preview.py # PDF preview generation test
|
| 51 |
+
│ └─ prepare_for_hf.py # Prepare project for Hugging Face deployment
|
| 52 |
+
│
|
| 53 |
+
├─ Scripts
|
| 54 |
+
│ ├─ run_local.sh # Launch standard or educational app locally
|
| 55 |
+
│ ├─ run_large_files.sh # Process large documents with optimized settings
|
| 56 |
+
│ └─ setup_git.sh # Configure Git repositories
|
| 57 |
+
│
|
| 58 |
+
├─ Educational Modules (streamlit/)
|
| 59 |
+
│ ├─ modules/
|
| 60 |
+
│ │ ├─ module1.py # Introduction and Problematization
|
| 61 |
+
│ │ ├─ module2.py # Historical Typography & OCR Challenges
|
| 62 |
+
│ │ ├─ module3.py # Document Analysis Techniques
|
| 63 |
+
│ │ ├─ module4.py # Processing Methods
|
| 64 |
+
│ │ ├─ module5.py # Research Applications
|
| 65 |
+
│ │ └─ module6.py # Future Directions
|
| 66 |
+
│ │
|
| 67 |
+
│ ├─ modular_app.py # Learning module framework
|
| 68 |
+
│ ├─ layout.py # UI components for educational interface
|
| 69 |
+
│ └─ process_file.py # File processing for educational app
|
| 70 |
+
│
|
| 71 |
+
├─ UI Components (ui/)
|
| 72 |
+
│ └─ layout.py # Shared UI components and styling
|
| 73 |
+
│
|
| 74 |
+
├─ Data Directories
|
| 75 |
+
│ ├─ input/ # Sample documents for testing/demo
|
| 76 |
+
│ └─ output/ # Output directory for processed files
|
| 77 |
+
│
|
| 78 |
+
└─ Dependencies
|
| 79 |
+
├─ requirements.txt # Python package dependencies
|
| 80 |
+
└─ packages.txt # System-level dependencies
|
| 81 |
+
```
|
| 82 |
|
| 83 |
+
## Setup for Local Development
|
| 84 |
|
| 85 |
+
1. Clone this repository
|
| 86 |
+
2. Install system dependencies:
|
| 87 |
+
- For PDF processing, you need poppler:
|
| 88 |
+
- On macOS: `brew install poppler`
|
| 89 |
+
- On Ubuntu/Debian: `apt-get install poppler-utils`
|
| 90 |
+
- On Windows: Download from [poppler releases](https://github.com/oschwartz10612/poppler-windows/releases/) and add to PATH
|
| 91 |
+
- For text recognition: `tesseract-ocr`
|
| 92 |
+
3. Install Python dependencies:
|
| 93 |
+
```
|
| 94 |
+
pip install -r requirements.txt
|
| 95 |
+
```
|
| 96 |
+
4. Set up your Mistral API key:
|
| 97 |
+
- Option 1: Create a `.env` file in this directory and add your Mistral API key:
|
| 98 |
+
```
|
| 99 |
+
MISTRAL_API_KEY=your_api_key_here
|
| 100 |
+
```
|
| 101 |
+
- Option 2: Set the `MISTRAL_API_KEY` environment variable directly:
|
| 102 |
+
```
|
| 103 |
+
export MISTRAL_API_KEY=your_api_key_here
|
| 104 |
+
```
|
| 105 |
+
- Get your API key from [Mistral AI Console](https://console.mistral.ai/api-keys/)
|
| 106 |
+
5. Run the Streamlit app using the script:
|
| 107 |
+
```
|
| 108 |
+
./run_local.sh
|
| 109 |
+
```
|
| 110 |
+
Or directly:
|
| 111 |
+
```
|
| 112 |
+
streamlit run app.py
|
| 113 |
+
```
|
| 114 |
|
| 115 |
+
## Usage
|
| 116 |
+
|
| 117 |
+
1. Upload an image or PDF file using the file uploader
|
| 118 |
+
2. Select processing options in the sidebar (e.g., use vision model, image preprocessing)
|
| 119 |
+
3. Click "Process Document" to analyze the file
|
| 120 |
+
4. View the structured results and extract information
|
| 121 |
+
|
| 122 |
+
## Application Versions
|
| 123 |
+
|
| 124 |
+
Two versions of the application are available:
|
| 125 |
+
|
| 126 |
+
1. **Standard Version** (`app.py`): Focused on document processing with a clean interface
|
| 127 |
+
2. **Educational Version** (`streamlit_app.py`): Enhanced with educational modules and interactive components
|
| 128 |
+
|
| 129 |
+
To run the educational version:
|
| 130 |
+
```
|
| 131 |
+
streamlit run streamlit_app.py
|
| 132 |
+
```
|
| 133 |
+
|
| 134 |
+
## Deployment on Hugging Face Spaces
|
| 135 |
+
|
| 136 |
+
This app is designed to be deployed on Hugging Face Spaces. To deploy:
|
| 137 |
+
|
| 138 |
+
1. Fork this repository to your GitHub account or directly create a new Space on [Hugging Face](https://huggingface.co/spaces)
|
| 139 |
+
2. Connect your GitHub repository to your Hugging Face Space for automatic deployment
|
| 140 |
+
3. Add your Mistral API key as a secret in your Hugging Face Space settings:
|
| 141 |
+
- Secret name: `HF_MISTRAL_API_KEY`
|
| 142 |
+
- Secret value: Your Mistral API key
|
| 143 |
+
|
| 144 |
+
The `README.md` contains the necessary configuration metadata for Hugging Face Spaces.
|
| 145 |
|
| 146 |
+
Check out the configuration reference at [Hugging Face Spaces documentation](https://huggingface.co/docs/hub/spaces-config-reference)
|
__pycache__/config.cpython-312.pyc
CHANGED
|
Binary files a/__pycache__/config.cpython-312.pyc and b/__pycache__/config.cpython-312.pyc differ
|
|
|
__pycache__/constants.cpython-312.pyc
DELETED
|
Binary file (11.6 kB)
|
|
|
__pycache__/error_handler.cpython-312.pyc
DELETED
|
Binary file (3.2 kB)
|
|
|
__pycache__/image_segmentation.cpython-312.pyc
DELETED
|
Binary file (10.6 kB)
|
|
|
__pycache__/language_detection.cpython-312.pyc
DELETED
|
Binary file (18 kB)
|
|
|
__pycache__/ocr_processing.cpython-312.pyc
DELETED
|
Binary file (15.5 kB)
|
|
|
__pycache__/ocr_utils.cpython-312.pyc
CHANGED
|
Binary files a/__pycache__/ocr_utils.cpython-312.pyc and b/__pycache__/ocr_utils.cpython-312.pyc differ
|
|
|
__pycache__/preprocessing.cpython-312.pyc
DELETED
|
Binary file (9.21 kB)
|
|
|
__pycache__/process_file.cpython-312.pyc
ADDED
|
Binary file (2.86 kB). View file
|
|
|
__pycache__/structured_ocr.cpython-312.pyc
CHANGED
|
Binary files a/__pycache__/structured_ocr.cpython-312.pyc and b/__pycache__/structured_ocr.cpython-312.pyc differ
|
|
|
__pycache__/ui_components.cpython-312.pyc
DELETED
|
Binary file (44.1 kB)
|
|
|
__pycache__/utils.cpython-312.pyc
DELETED
|
Binary file (14.2 kB)
|
|
|
app.py
CHANGED
|
@@ -1,604 +1,601 @@
|
|
| 1 |
-
# Standard library imports
|
| 2 |
import os
|
|
|
|
| 3 |
import json
|
| 4 |
import sys
|
| 5 |
import time
|
| 6 |
-
import base64
|
| 7 |
-
import io
|
| 8 |
-
import logging
|
| 9 |
from pathlib import Path
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
|
| 15 |
-
|
| 16 |
-
|
| 17 |
-
# Third-party imports
|
| 18 |
-
import streamlit as st
|
| 19 |
|
| 20 |
-
#
|
| 21 |
-
from preprocessing import convert_pdf_to_images, preprocess_image
|
| 22 |
-
from ocr_processing import process_file
|
| 23 |
-
from ui.ui_components import (
|
| 24 |
-
ProgressReporter,
|
| 25 |
-
create_sidebar_options,
|
| 26 |
-
display_results,
|
| 27 |
-
create_file_uploader,
|
| 28 |
-
display_about_tab,
|
| 29 |
-
display_previous_results,
|
| 30 |
-
display_document_with_images
|
| 31 |
-
)
|
| 32 |
-
from utils import get_base64_from_image, handle_temp_files, format_timestamp
|
| 33 |
-
from error_handler import handle_ocr_error, check_file_size
|
| 34 |
-
from constants import (
|
| 35 |
-
MAX_FILE_SIZE_MB,
|
| 36 |
-
MAX_PAGES,
|
| 37 |
-
DOCUMENT_TYPES,
|
| 38 |
-
DOCUMENT_LAYOUTS,
|
| 39 |
-
CUSTOM_PROMPT_TEMPLATES,
|
| 40 |
-
LAYOUT_PROMPT_ADDITIONS
|
| 41 |
-
)
|
| 42 |
from structured_ocr import StructuredOCR
|
| 43 |
from config import MISTRAL_API_KEY
|
| 44 |
-
from utils.image_utils import create_results_zip
|
| 45 |
|
| 46 |
-
#
|
| 47 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 48 |
|
| 49 |
# Set page configuration
|
| 50 |
st.set_page_config(
|
| 51 |
page_title="Historical OCR",
|
| 52 |
-
page_icon=
|
| 53 |
layout="wide",
|
| 54 |
-
initial_sidebar_state="
|
| 55 |
)
|
| 56 |
|
| 57 |
-
#
|
| 58 |
-
|
| 59 |
-
|
| 60 |
-
|
| 61 |
-
|
| 62 |
-
|
| 63 |
-
|
| 64 |
-
|
| 65 |
-
|
| 66 |
-
# The current implementation uses:
|
| 67 |
-
# 1. A dedicated close_document() callback function triggered by the button's on_click
|
| 68 |
-
# 2. A flag-based approach (close_clicked) to handle cleanup on the next run cycle
|
| 69 |
-
# 3. Early cleanup detection and st.rerun() to ensure clean UI rendering
|
| 70 |
-
#
|
| 71 |
-
# Previous approaches using direct state manipulation or conditional rendering based on
|
| 72 |
-
# reset flags led to persistent UI elements and resource leaks.
|
| 73 |
-
#
|
| 74 |
-
# Consult https://docs.streamlit.io/library/advanced-features/session-state for details.
|
| 75 |
-
# ========================================================================================
|
| 76 |
-
|
| 77 |
-
def reset_document_state():
|
| 78 |
-
"""Reset only document-specific state variables
|
| 79 |
-
|
| 80 |
-
This function explicitly resets all document-related variables to ensure
|
| 81 |
-
clean state between document processing, preventing cached data issues.
|
| 82 |
-
"""
|
| 83 |
-
st.session_state.sample_document = None
|
| 84 |
-
st.session_state.original_sample_bytes = None
|
| 85 |
-
st.session_state.original_sample_name = None
|
| 86 |
-
st.session_state.original_sample_mime_type = None
|
| 87 |
-
st.session_state.is_sample_document = False
|
| 88 |
-
st.session_state.processed_document_active = False
|
| 89 |
-
st.session_state.sample_document_processed = False
|
| 90 |
-
st.session_state.sample_just_loaded = False
|
| 91 |
-
st.session_state.last_processed_file = None
|
| 92 |
-
st.session_state.selected_previous_result = None
|
| 93 |
-
# Keep temp_file_paths but ensure it's empty after cleanup
|
| 94 |
-
if 'temp_file_paths' in st.session_state:
|
| 95 |
-
st.session_state.temp_file_paths = []
|
| 96 |
-
|
| 97 |
-
def init_session_state():
|
| 98 |
-
"""Initialize session state variables if they don't already exist
|
| 99 |
-
|
| 100 |
-
This function follows Streamlit's recommended patterns for state initialization.
|
| 101 |
-
It only creates variables if they don't exist yet and doesn't modify existing values.
|
| 102 |
-
"""
|
| 103 |
-
# Initialize persistent app state variables
|
| 104 |
-
if 'previous_results' not in st.session_state:
|
| 105 |
-
st.session_state.previous_results = []
|
| 106 |
-
if 'temp_file_paths' not in st.session_state:
|
| 107 |
-
st.session_state.temp_file_paths = []
|
| 108 |
-
if 'auto_process_sample' not in st.session_state:
|
| 109 |
-
st.session_state.auto_process_sample = False
|
| 110 |
-
if 'close_clicked' not in st.session_state:
|
| 111 |
-
st.session_state.close_clicked = False
|
| 112 |
-
if 'active_tab' not in st.session_state:
|
| 113 |
-
st.session_state.active_tab = 0
|
| 114 |
-
|
| 115 |
-
# Initialize document-specific state variables
|
| 116 |
-
if 'last_processed_file' not in st.session_state:
|
| 117 |
-
st.session_state.last_processed_file = None
|
| 118 |
-
if 'sample_just_loaded' not in st.session_state:
|
| 119 |
-
st.session_state.sample_just_loaded = False
|
| 120 |
-
if 'processed_document_active' not in st.session_state:
|
| 121 |
-
st.session_state.processed_document_active = False
|
| 122 |
-
if 'sample_document_processed' not in st.session_state:
|
| 123 |
-
st.session_state.sample_document_processed = False
|
| 124 |
-
if 'sample_document' not in st.session_state:
|
| 125 |
-
st.session_state.sample_document = None
|
| 126 |
-
if 'original_sample_bytes' not in st.session_state:
|
| 127 |
-
st.session_state.original_sample_bytes = None
|
| 128 |
-
if 'original_sample_name' not in st.session_state:
|
| 129 |
-
st.session_state.original_sample_name = None
|
| 130 |
-
if 'is_sample_document' not in st.session_state:
|
| 131 |
-
st.session_state.is_sample_document = False
|
| 132 |
-
if 'selected_previous_result' not in st.session_state:
|
| 133 |
-
st.session_state.selected_previous_result = None
|
| 134 |
|
| 135 |
-
|
| 136 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 137 |
|
| 138 |
-
|
| 139 |
-
|
| 140 |
-
|
|
|
|
| 141 |
|
| 142 |
-
|
| 143 |
-
|
| 144 |
-
|
| 145 |
-
|
|
|
|
|
|
|
| 146 |
|
| 147 |
-
|
| 148 |
-
|
| 149 |
-
|
| 150 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 151 |
|
| 152 |
-
#
|
| 153 |
-
|
|
|
|
|
|
|
| 154 |
|
| 155 |
-
|
| 156 |
-
st.session_state.close_clicked = True
|
| 157 |
|
| 158 |
-
|
| 159 |
-
|
| 160 |
-
|
| 161 |
-
|
| 162 |
-
# Add a simplified info message about examples and CSS in the same markdown block
|
| 163 |
-
# to reduce spacing between elements
|
| 164 |
-
st.markdown("""
|
| 165 |
-
This app can process various historical documents:
|
| 166 |
-
- Historical photographs, maps, and manuscripts
|
| 167 |
-
- Handwritten letters and documents
|
| 168 |
-
- Printed books and articles
|
| 169 |
-
- Multi-page PDFs
|
| 170 |
-
|
| 171 |
-
<style>
|
| 172 |
-
/* Make the selectbox container match the full column width */
|
| 173 |
-
.main .block-container .element-container:has([data-testid="stSelectbox"]) {
|
| 174 |
-
width: 100% !important;
|
| 175 |
-
max-width: 100% !important;
|
| 176 |
-
margin-top: -12px !important; /* Reduce space between text and selectbox */
|
| 177 |
-
}
|
| 178 |
-
|
| 179 |
-
/* Make the actual selectbox control take the full width */
|
| 180 |
-
.stSelectbox > div > div {
|
| 181 |
-
width: 100% !important;
|
| 182 |
-
max-width: 100% !important;
|
| 183 |
-
}
|
| 184 |
-
|
| 185 |
-
/* Tighten spacing in the sample documents tab */
|
| 186 |
-
.main .block-container [data-testid="stVerticalBlock"] > div:nth-child(n+2) {
|
| 187 |
-
margin-top: 0.5rem !important;
|
| 188 |
-
}
|
| 189 |
-
</style>
|
| 190 |
-
""", unsafe_allow_html=True)
|
| 191 |
|
| 192 |
-
|
| 193 |
-
|
| 194 |
-
|
| 195 |
-
|
| 196 |
-
|
| 197 |
-
|
| 198 |
-
|
| 199 |
-
|
| 200 |
-
|
| 201 |
-
|
| 202 |
-
|
| 203 |
-
|
| 204 |
-
"Select a sample document",
|
| 205 |
-
"Restaurant Menu (PDF)",
|
| 206 |
-
"The Magician (Image)",
|
| 207 |
-
"Handwritten Letter (Image)",
|
| 208 |
-
"Magellan Travels (Image)",
|
| 209 |
-
"Milgram Flier (Image)",
|
| 210 |
-
"Historical Recipe (Image)"
|
| 211 |
-
]
|
| 212 |
-
|
| 213 |
-
# Initialize sample_document in session state if it doesn't exist
|
| 214 |
-
if 'sample_document' not in st.session_state:
|
| 215 |
-
st.session_state.sample_document = None
|
| 216 |
|
| 217 |
-
|
|
|
|
|
|
|
|
|
|
| 218 |
|
| 219 |
-
|
| 220 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 221 |
|
| 222 |
-
#
|
| 223 |
-
|
| 224 |
-
|
| 225 |
-
import requests
|
| 226 |
-
from io import BytesIO
|
| 227 |
-
|
| 228 |
-
with st.spinner(f"Downloading {sample_names[selected_sample]}..."):
|
| 229 |
-
response = requests.get(selected_url)
|
| 230 |
-
response.raise_for_status()
|
| 231 |
-
|
| 232 |
-
# Extract filename from URL
|
| 233 |
-
file_name = selected_url.split("/")[-1]
|
| 234 |
-
|
| 235 |
-
# Create a BytesIO object from the downloaded content
|
| 236 |
-
file_content = BytesIO(response.content)
|
| 237 |
-
|
| 238 |
-
# Store as a UploadedFile-like object in session state
|
| 239 |
-
class SampleDocument:
|
| 240 |
-
def __init__(self, name, content, content_type):
|
| 241 |
-
self.name = name
|
| 242 |
-
self._content = content
|
| 243 |
-
self.type = content_type
|
| 244 |
-
self.size = len(content)
|
| 245 |
-
|
| 246 |
-
def getvalue(self):
|
| 247 |
-
return self._content
|
| 248 |
-
|
| 249 |
-
def read(self):
|
| 250 |
-
return self._content
|
| 251 |
-
|
| 252 |
-
def seek(self, position):
|
| 253 |
-
# Implement seek for compatibility with some file operations
|
| 254 |
-
return
|
| 255 |
-
|
| 256 |
-
def tell(self):
|
| 257 |
-
# Implement tell for compatibility
|
| 258 |
-
return 0
|
| 259 |
-
|
| 260 |
-
# Determine content type based on file extension
|
| 261 |
-
if file_name.lower().endswith('.pdf'):
|
| 262 |
-
content_type = 'application/pdf'
|
| 263 |
-
elif file_name.lower().endswith(('.jpg', '.jpeg')):
|
| 264 |
-
content_type = 'image/jpeg'
|
| 265 |
-
elif file_name.lower().endswith('.png'):
|
| 266 |
-
content_type = 'image/png'
|
| 267 |
-
else:
|
| 268 |
-
content_type = 'application/octet-stream'
|
| 269 |
-
|
| 270 |
-
# Reset any document state before loading a new sample
|
| 271 |
-
if st.session_state.processed_document_active:
|
| 272 |
-
# Clean up any temporary files from previous processing
|
| 273 |
-
if st.session_state.temp_file_paths:
|
| 274 |
-
handle_temp_files(st.session_state.temp_file_paths)
|
| 275 |
-
|
| 276 |
-
# Reset all document-specific state variables
|
| 277 |
-
reset_document_state()
|
| 278 |
-
|
| 279 |
-
# Save download info in session state
|
| 280 |
-
st.session_state.sample_document = SampleDocument(
|
| 281 |
-
name=file_name,
|
| 282 |
-
content=response.content,
|
| 283 |
-
content_type=content_type
|
| 284 |
-
)
|
| 285 |
-
|
| 286 |
-
# Store original bytes for reprocessing with proper MIME type handling
|
| 287 |
-
st.session_state.original_sample_bytes = response.content
|
| 288 |
-
st.session_state.original_sample_name = file_name
|
| 289 |
-
st.session_state.original_sample_mime_type = content_type
|
| 290 |
-
|
| 291 |
-
# Set state flags
|
| 292 |
-
st.session_state.sample_just_loaded = True
|
| 293 |
-
st.session_state.is_sample_document = True
|
| 294 |
-
# Generate a unique identifier for the sample document
|
| 295 |
-
st.session_state.last_processed_file = f"{file_name}_{len(response.content)}"
|
| 296 |
-
|
| 297 |
-
# Set a flag to show redirect message
|
| 298 |
-
st.session_state.redirect_to_processing = True
|
| 299 |
-
st.rerun()
|
| 300 |
-
except Exception as e:
|
| 301 |
-
st.error(f"Error downloading sample document: {str(e)}")
|
| 302 |
-
st.info("Please try uploading your own document instead.")
|
| 303 |
-
else:
|
| 304 |
-
# If no sample is selected, clear the sample document in session state
|
| 305 |
-
st.session_state.sample_document = None
|
| 306 |
-
|
| 307 |
-
def process_document(uploaded_file, left_col, right_col, sidebar_options):
|
| 308 |
-
"""Process the uploaded document and display results"""
|
| 309 |
-
if uploaded_file is None:
|
| 310 |
-
return
|
| 311 |
|
| 312 |
-
|
| 313 |
-
|
| 314 |
-
|
| 315 |
-
if file_size_mb > MAX_FILE_SIZE_MB:
|
| 316 |
-
with left_col:
|
| 317 |
-
st.error(f"File too large ({file_size_mb:.1f} MB). Maximum file size is {MAX_FILE_SIZE_MB}MB.")
|
| 318 |
-
return
|
| 319 |
|
| 320 |
-
|
| 321 |
-
|
| 322 |
-
|
| 323 |
-
# Make sure last_processed_file is initialized
|
| 324 |
-
if 'last_processed_file' not in st.session_state:
|
| 325 |
-
st.session_state.last_processed_file = None
|
| 326 |
|
| 327 |
-
|
| 328 |
-
|
| 329 |
-
|
| 330 |
-
|
| 331 |
-
# Process button - flush left with similar padding as file browser
|
| 332 |
-
with left_col:
|
| 333 |
-
# Create a process button with minimal spacing to the uploader
|
| 334 |
-
st.markdown('<div style="padding: 0.2rem 0; min-width: 170px; margin-top: -10px; overflow: visible;">', unsafe_allow_html=True)
|
| 335 |
-
process_button = st.button("Process Document", key="process_document_btn")
|
| 336 |
-
st.markdown('</div>', unsafe_allow_html=True)
|
| 337 |
-
|
| 338 |
-
# Handle sample document recreation if needed
|
| 339 |
-
if process_button and st.session_state.processed_document_active and st.session_state.original_sample_bytes is not None:
|
| 340 |
-
# Recreate the uploaded file from stored bytes
|
| 341 |
-
from io import BytesIO
|
| 342 |
-
import mimetypes
|
| 343 |
-
|
| 344 |
-
# Determine mime type based on file extension
|
| 345 |
-
file_ext = os.path.splitext(st.session_state.original_sample_name)[1].lower()
|
| 346 |
-
if file_ext == '.pdf':
|
| 347 |
-
mime_type = 'application/pdf'
|
| 348 |
-
elif file_ext in ['.jpg', '.jpeg']:
|
| 349 |
-
mime_type = 'image/jpeg'
|
| 350 |
-
elif file_ext == '.png':
|
| 351 |
-
mime_type = 'image/png'
|
| 352 |
-
else:
|
| 353 |
-
mime_type = mimetypes.guess_type(st.session_state.original_sample_name)[0] or 'application/octet-stream'
|
| 354 |
|
| 355 |
-
#
|
| 356 |
-
|
| 357 |
-
|
| 358 |
-
|
| 359 |
-
'read': lambda: st.session_state.original_sample_bytes,
|
| 360 |
-
'seek': lambda x: None,
|
| 361 |
-
'type': mime_type
|
| 362 |
-
})
|
| 363 |
|
| 364 |
-
#
|
| 365 |
-
|
| 366 |
-
progress_placeholder = st.empty()
|
| 367 |
|
| 368 |
-
#
|
| 369 |
-
|
| 370 |
-
|
| 371 |
-
|
| 372 |
-
|
| 373 |
-
|
| 374 |
-
|
| 375 |
-
|
| 376 |
-
|
| 377 |
-
|
| 378 |
-
|
| 379 |
-
|
| 380 |
-
|
| 381 |
-
|
| 382 |
-
|
| 383 |
-
# Show preprocessing metadata in a well-formatted caption
|
| 384 |
-
meta_items = []
|
| 385 |
-
# Only include document type in the list if actual preprocessing is applied
|
| 386 |
-
has_active_preprocessing = (
|
| 387 |
-
sidebar_options["preprocessing_options"].get("grayscale", False) or
|
| 388 |
-
sidebar_options["preprocessing_options"].get("denoise", False) or
|
| 389 |
-
sidebar_options["preprocessing_options"].get("contrast", 0) != 0 or
|
| 390 |
-
sidebar_options["preprocessing_options"].get("rotation", 0) != 0
|
| 391 |
-
)
|
| 392 |
-
|
| 393 |
-
# Only show document type if there's actual preprocessing being applied
|
| 394 |
-
if has_active_preprocessing and sidebar_options["preprocessing_options"].get("document_type", "standard") != "standard":
|
| 395 |
-
meta_items.append(f"Document type ({sidebar_options['preprocessing_options']['document_type']})")
|
| 396 |
-
if sidebar_options["preprocessing_options"].get("grayscale", False):
|
| 397 |
-
meta_items.append("Grayscale")
|
| 398 |
-
if sidebar_options["preprocessing_options"].get("denoise", False):
|
| 399 |
-
meta_items.append("Denoise")
|
| 400 |
-
if sidebar_options["preprocessing_options"].get("contrast", 0) != 0:
|
| 401 |
-
meta_items.append(f"Contrast ({sidebar_options['preprocessing_options']['contrast']})")
|
| 402 |
-
if sidebar_options["preprocessing_options"].get("rotation", 0) != 0:
|
| 403 |
-
meta_items.append(f"Rotation ({sidebar_options['preprocessing_options']['rotation']}°)")
|
| 404 |
-
|
| 405 |
-
# Only show "Applied:" if there are actual preprocessing steps
|
| 406 |
-
if meta_items:
|
| 407 |
-
meta_text = "Applied: " + ", ".join(meta_items)
|
| 408 |
-
st.caption(meta_text)
|
| 409 |
-
except Exception as e:
|
| 410 |
-
st.error(f"Error in preprocessing: {str(e)}")
|
| 411 |
-
st.info("Try using grayscale preprocessing for PNG images with transparency")
|
| 412 |
|
| 413 |
-
#
|
| 414 |
-
|
| 415 |
-
|
| 416 |
-
# Check if this is an auto-processing situation
|
| 417 |
-
auto_processing = st.session_state.auto_process_sample and not st.session_state.processed_document_active
|
| 418 |
|
| 419 |
-
|
| 420 |
-
|
| 421 |
-
|
| 422 |
-
|
| 423 |
|
| 424 |
-
|
| 425 |
-
|
| 426 |
-
|
| 427 |
-
|
| 428 |
-
if should_process:
|
| 429 |
-
# Reset auto-process flag to avoid processing on next rerun
|
| 430 |
-
if st.session_state.auto_process_sample:
|
| 431 |
-
st.session_state.auto_process_sample = False
|
| 432 |
-
|
| 433 |
-
# Move the progress indicator reference to just below the button
|
| 434 |
-
progress_reporter = ProgressReporter(progress_placeholder).setup()
|
| 435 |
|
| 436 |
-
|
| 437 |
-
|
| 438 |
-
|
| 439 |
-
|
| 440 |
-
|
| 441 |
-
|
| 442 |
-
|
| 443 |
-
|
| 444 |
-
|
| 445 |
-
|
| 446 |
-
|
| 447 |
-
|
| 448 |
-
|
| 449 |
-
|
| 450 |
-
|
| 451 |
-
|
| 452 |
-
|
| 453 |
-
use_segmentation=sidebar_options.get("use_segmentation", False)
|
| 454 |
-
)
|
| 455 |
-
|
| 456 |
-
# Ensure temp_file_paths in session state is updated with any new paths
|
| 457 |
-
# This is critical for proper resource cleanup when document is closed
|
| 458 |
-
if 'has_images' in result and result['has_images']:
|
| 459 |
-
logger.info("Document has images, ensuring temp files are tracked")
|
| 460 |
-
if 'temp_file_paths' not in st.session_state:
|
| 461 |
-
st.session_state.temp_file_paths = []
|
| 462 |
-
|
| 463 |
-
# Handle text-only OCR results (like the Milgram flier)
|
| 464 |
-
if ('ocr_contents' in result and
|
| 465 |
-
'raw_text' in result['ocr_contents'] and
|
| 466 |
-
len(result['ocr_contents']) <= 2 and # Only raw_text and possibly one other field
|
| 467 |
-
'has_images' not in result):
|
| 468 |
-
logger.info("Text-only OCR detected, handling as special case")
|
| 469 |
-
# Ensure raw_text is properly formatted as markdown
|
| 470 |
-
raw_text = result['ocr_contents']['raw_text']
|
| 471 |
-
# If we don't have other structured content, set a placeholder title
|
| 472 |
-
if 'title' not in result['ocr_contents']:
|
| 473 |
-
result['ocr_contents']['title'] = "Document Text"
|
| 474 |
-
|
| 475 |
-
# Display success message at the top of results, before any previews
|
| 476 |
-
with left_col:
|
| 477 |
-
# First show the success message (full width)
|
| 478 |
-
st.success("**Document processed successfully**")
|
| 479 |
-
|
| 480 |
-
# Then show the close button (also full width, positioned to left)
|
| 481 |
-
st.button("Close Document",
|
| 482 |
-
key="close_document_btn",
|
| 483 |
-
type="secondary",
|
| 484 |
-
on_click=close_document)
|
| 485 |
-
|
| 486 |
-
# Add a small spacer
|
| 487 |
-
st.markdown("<div style='height: 10px;'></div>", unsafe_allow_html=True)
|
| 488 |
-
|
| 489 |
-
# Display results
|
| 490 |
-
display_results(result, right_col, sidebar_options.get("custom_prompt", ""))
|
| 491 |
-
|
| 492 |
-
# Set processed_document_active to True when a new document is processed
|
| 493 |
-
st.session_state.processed_document_active = True
|
| 494 |
-
|
| 495 |
-
# Clear the auto-processing message
|
| 496 |
-
auto_processing_message.empty()
|
| 497 |
-
|
| 498 |
-
# Store information about this processed file to track when new files are uploaded
|
| 499 |
-
if uploaded_file is not None:
|
| 500 |
-
st.session_state.last_processed_file = current_file_identifier
|
| 501 |
-
|
| 502 |
-
# Store the result in the previous results list
|
| 503 |
-
# Add timestamp to result for history tracking
|
| 504 |
-
result_copy = result.copy()
|
| 505 |
-
result_copy['timestamp'] = format_timestamp()
|
| 506 |
-
|
| 507 |
-
# Store if this was a sample document
|
| 508 |
-
if 'is_sample_document' in st.session_state and st.session_state.is_sample_document:
|
| 509 |
-
result_copy['sample_document'] = True
|
| 510 |
-
|
| 511 |
-
# Add to session state, keeping the most recent 20 results
|
| 512 |
-
st.session_state.previous_results.insert(0, result_copy)
|
| 513 |
-
if len(st.session_state.previous_results) > 20:
|
| 514 |
-
st.session_state.previous_results = st.session_state.previous_results[:20]
|
| 515 |
-
|
| 516 |
-
except Exception as e:
|
| 517 |
-
st.error(f"Error processing document: {str(e)}")
|
| 518 |
-
|
| 519 |
-
# Log the error
|
| 520 |
-
import logging
|
| 521 |
-
logging.error(f"Document processing error: {str(e)}", exc_info=True)
|
| 522 |
|
| 523 |
-
|
| 524 |
-
|
| 525 |
-
|
| 526 |
-
init_session_state()
|
| 527 |
|
| 528 |
-
#
|
| 529 |
-
|
| 530 |
-
|
| 531 |
-
|
| 532 |
-
if st.session_state.get('close_clicked', False):
|
| 533 |
-
# Reset the flag - cleanup has been handled
|
| 534 |
-
st.session_state.close_clicked = False
|
| 535 |
-
# Don't do anything else in this run - force a clean restart
|
| 536 |
-
st.rerun()
|
| 537 |
|
| 538 |
-
|
| 539 |
-
|
| 540 |
-
st.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 541 |
|
| 542 |
-
#
|
| 543 |
-
|
| 544 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 545 |
|
| 546 |
-
#
|
| 547 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 548 |
|
| 549 |
-
|
| 550 |
-
tab_names = ["Document Processing", "Sample Documents", "Learn More"]
|
| 551 |
-
main_tab1, main_tab2, main_tab3 = st.tabs(tab_names)
|
| 552 |
|
| 553 |
-
|
| 554 |
-
|
| 555 |
-
|
| 556 |
-
|
| 557 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 558 |
|
| 559 |
-
|
| 560 |
-
|
| 561 |
-
|
| 562 |
-
|
| 563 |
-
|
| 564 |
-
|
| 565 |
-
|
| 566 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 567 |
|
| 568 |
-
|
| 569 |
-
|
| 570 |
-
|
| 571 |
|
| 572 |
-
|
| 573 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 574 |
|
| 575 |
-
#
|
| 576 |
-
|
| 577 |
-
|
| 578 |
-
|
| 579 |
-
|
| 580 |
-
|
| 581 |
-
|
| 582 |
-
|
| 583 |
-
|
| 584 |
-
|
| 585 |
-
|
| 586 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 587 |
|
| 588 |
-
#
|
| 589 |
-
|
| 590 |
-
|
| 591 |
-
|
| 592 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 593 |
|
| 594 |
-
|
| 595 |
-
|
| 596 |
-
|
| 597 |
-
|
| 598 |
-
|
| 599 |
-
|
| 600 |
-
|
| 601 |
-
|
| 602 |
-
# Run the application
|
| 603 |
-
if __name__ == "__main__":
|
| 604 |
-
main()
|
|
|
|
|
|
|
| 1 |
import os
|
| 2 |
+
import streamlit as st
|
| 3 |
import json
|
| 4 |
import sys
|
| 5 |
import time
|
|
|
|
|
|
|
|
|
|
| 6 |
from pathlib import Path
|
| 7 |
+
import tempfile
|
| 8 |
+
import io
|
| 9 |
+
from pdf2image import convert_from_bytes
|
| 10 |
+
from PIL import Image, ImageEnhance, ImageFilter
|
| 11 |
+
import cv2
|
| 12 |
+
import numpy as np
|
|
|
|
|
|
|
|
|
|
| 13 |
|
| 14 |
+
# Import the StructuredOCR class and config from the local files
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 15 |
from structured_ocr import StructuredOCR
|
| 16 |
from config import MISTRAL_API_KEY
|
|
|
|
| 17 |
|
| 18 |
+
# Check for modular UI components
|
| 19 |
+
try:
|
| 20 |
+
from ui.layout import tool_container, key_concept, research_question
|
| 21 |
+
MODULAR_UI = True
|
| 22 |
+
except ImportError:
|
| 23 |
+
MODULAR_UI = False
|
| 24 |
|
| 25 |
# Set page configuration
|
| 26 |
st.set_page_config(
|
| 27 |
page_title="Historical OCR",
|
| 28 |
+
page_icon="📜",
|
| 29 |
layout="wide",
|
| 30 |
+
initial_sidebar_state="expanded"
|
| 31 |
)
|
| 32 |
|
| 33 |
+
# Enable caching for expensive operations
|
| 34 |
+
@st.cache_data(ttl=3600, show_spinner=False)
def convert_pdf_to_images(pdf_bytes, dpi=150):
    """Render a PDF given as raw bytes into a list of PIL page images.

    Streamlit caches the result for an hour, so repeated previews of the
    same document do not trigger a fresh render.

    Args:
        pdf_bytes: Raw bytes of the PDF document.
        dpi: Rendering resolution; higher values are sharper but slower.

    Returns:
        A list of PIL Image objects (one per page), or an empty list when
        the conversion fails (the error is surfaced in the UI).
    """
    try:
        pages = convert_from_bytes(pdf_bytes, dpi=dpi)
    except Exception as e:
        st.error(f"Error converting PDF: {str(e)}")
        return []
    return pages
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 42 |
|
| 43 |
+
@st.cache_data(ttl=3600, show_spinner=False)
def preprocess_image(image_bytes, preprocessing_options):
    """Preprocess image bytes with the selected enhancement options.

    Applies the requested steps in a fixed order — grayscale, contrast,
    denoise, adaptive threshold — and returns the result re-encoded as
    PNG bytes.  Cached for an hour so the preview pane and the actual
    processing pass share one computation.

    Args:
        image_bytes: Raw bytes of the source image.
        preprocessing_options: Dict with optional keys "grayscale" (bool),
            "contrast" (int, -5..5), "denoise" (bool), "threshold" (bool).

    Returns:
        PNG-encoded bytes of the processed image.
    """
    # Normalize to RGB up front: PNGs with alpha (RGBA), palette (P) or
    # single-channel (L) images would otherwise crash the 3-channel
    # OpenCV conversions below (COLOR_RGB2GRAY, fastNlMeansDenoisingColored).
    image = Image.open(io.BytesIO(image_bytes))
    if image.mode != "RGB":
        image = image.convert("RGB")
    img_array = np.array(image)

    # Apply preprocessing based on selected options
    if preprocessing_options.get("grayscale", False):
        # Round-trip through grayscale but keep 3 channels so the
        # later colored-denoise step still accepts the array.
        img_array = cv2.cvtColor(img_array, cv2.COLOR_RGB2GRAY)
        img_array = cv2.cvtColor(img_array, cv2.COLOR_GRAY2RGB)

    if preprocessing_options.get("contrast", 0) != 0:
        # Map the -5..5 slider onto a PIL enhancement factor of 0.5..1.5.
        contrast_factor = 1 + (preprocessing_options.get("contrast", 0) / 10)
        image = Image.fromarray(img_array)
        enhancer = ImageEnhance.Contrast(image)
        image = enhancer.enhance(contrast_factor)
        img_array = np.array(image)

    if preprocessing_options.get("denoise", False):
        img_array = cv2.fastNlMeansDenoisingColored(img_array, None, 10, 10, 7, 21)

    if preprocessing_options.get("threshold", False):
        # Convert to grayscale if not already
        if len(img_array.shape) == 3:
            gray = cv2.cvtColor(img_array, cv2.COLOR_RGB2GRAY)
        else:
            gray = img_array
        # Apply adaptive threshold to enhance text strokes
        binary = cv2.adaptiveThreshold(gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
                                       cv2.THRESH_BINARY, 11, 2)
        # Convert back to RGB so downstream consumers always get 3 channels
        img_array = cv2.cvtColor(binary, cv2.COLOR_GRAY2RGB)

    # Convert back to PIL Image and re-encode as PNG bytes
    processed_image = Image.fromarray(img_array)
    byte_io = io.BytesIO()
    processed_image.save(byte_io, format='PNG')
    byte_io.seek(0)

    return byte_io.getvalue()
|
|
|
|
| 86 |
|
| 87 |
+
# Define functions
|
| 88 |
+
def process_file(uploaded_file, use_vision=True, preprocessing_options=None):
    """Process the uploaded file and return the OCR results.

    Writes the upload (and, when preprocessing is requested, a processed
    copy) to temporary files, runs StructuredOCR over the final file and
    reports progress in the Streamlit UI.  Every temporary file created
    here is removed before this function returns.

    Args:
        uploaded_file: The uploaded file to process (Streamlit UploadedFile).
        use_vision: Whether to use the vision model for image analysis.
        preprocessing_options: Dictionary of preprocessing options
            (grayscale/denoise/threshold/contrast); may be None.

    Returns:
        A dict with the structured OCR result, or a placeholder dict when
        no API key is configured or the file exceeds the API size limit.

    Raises:
        Exception: Re-raises any processing error after surfacing it in
            the UI.
    """
    if preprocessing_options is None:
        preprocessing_options = {}

    # Show progress indicator
    progress_bar = st.progress(0)
    status_text = st.empty()
    status_text.text("Preparing file for processing...")

    # Track every temp file we create so the finally block can remove all
    # of them.  Previously only the most recent temp_path was unlinked,
    # which leaked the original temp file whenever preprocessing ran.
    temp_paths = []

    # Save the uploaded file to a temporary file
    with tempfile.NamedTemporaryFile(delete=False, suffix=Path(uploaded_file.name).suffix) as tmp:
        tmp.write(uploaded_file.getvalue())
        temp_path = tmp.name
        temp_paths.append(temp_path)

    try:
        # Check if API key is available
        if not MISTRAL_API_KEY:
            # Return dummy data if no API key
            progress_bar.progress(100)
            status_text.empty()
            return {
                "file_name": uploaded_file.name,
                "topics": ["Sample Document"],
                "languages": ["English"],
                "ocr_contents": {
                    "title": "Sample Document",
                    "content": "This is sample content. To process real documents, please set the MISTRAL_API_KEY environment variable."
                }
            }

        # Update progress
        progress_bar.progress(20)
        status_text.text("Initializing OCR processor...")

        # Initialize OCR processor
        processor = StructuredOCR()

        # Determine file type from extension
        file_ext = Path(uploaded_file.name).suffix.lower()
        file_type = "pdf" if file_ext == ".pdf" else "image"

        # Apply preprocessing if needed (images only; PDFs go through as-is)
        if any(preprocessing_options.values()) and file_type == "image":
            status_text.text("Applying image preprocessing...")
            processed_bytes = preprocess_image(uploaded_file.getvalue(), preprocessing_options)

            # Save processed image to its own temp file and OCR that instead
            with tempfile.NamedTemporaryFile(delete=False, suffix=Path(uploaded_file.name).suffix) as proc_tmp:
                proc_tmp.write(processed_bytes)
                temp_path = proc_tmp.name
                temp_paths.append(temp_path)

        # Get file size in MB
        file_size_mb = os.path.getsize(temp_path) / (1024 * 1024)

        # Check if file exceeds API limits (50 MB)
        if file_size_mb > 50:
            st.error(f"File too large ({file_size_mb:.1f} MB). Maximum file size allowed by Mistral API is 50MB.")
            return {
                "file_name": uploaded_file.name,
                "topics": ["Document"],
                "languages": ["English"],
                "confidence_score": 0.0,
                "error": f"File size {file_size_mb:.2f} MB exceeds Mistral API limit of 50 MB",
                "ocr_contents": {
                    "error": f"Failed to process file: File size {file_size_mb:.2f} MB exceeds Mistral API limit of 50 MB",
                    "partial_text": "Document could not be processed due to size limitations."
                }
            }

        # Update progress
        progress_bar.progress(40)
        status_text.text("Processing document with OCR...")

        # Process the file with file size information for automatic page limiting
        # Make sure we're using the latest mistral-ocr model
        # See https://docs.mistral.ai/capabilities/document/ for more info
        result = processor.process_file(temp_path, file_type=file_type, use_vision=use_vision, file_size_mb=file_size_mb)

        # Complete progress
        progress_bar.progress(100)
        status_text.empty()

        return result
    except Exception as e:
        progress_bar.progress(100)
        status_text.empty()
        st.error(f"Error during processing: {str(e)}")
        raise
    finally:
        # Clean up every temporary file we created (original upload copy
        # and, if preprocessing ran, the processed copy)
        for path in temp_paths:
            if os.path.exists(path):
                os.unlink(path)
|
| 187 |
+
|
| 188 |
+
# App title and description
|
| 189 |
+
st.title("Historical Document OCR")
|
| 190 |
+
st.subheader("Powered by Mistral AI")
|
| 191 |
+
|
| 192 |
+
# Create main layout with tabs and columns
|
| 193 |
+
main_tab1, main_tab2 = st.tabs(["Document Processing", "About"])
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 194 |
|
| 195 |
+
with main_tab1:
|
| 196 |
+
# Create a two-column layout for file upload and preview
|
| 197 |
+
upload_col, preview_col = st.columns([1, 1])
|
|
|
|
| 198 |
|
| 199 |
+
# File uploader in the left column
|
| 200 |
+
with upload_col:
|
| 201 |
+
st.markdown("""
|
| 202 |
+
Upload an image or PDF file to get started.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 203 |
|
| 204 |
+
Using the latest `mistral-ocr-latest` model for advanced document understanding.
|
| 205 |
+
""")
|
| 206 |
+
uploaded_file = st.file_uploader("Choose a file", type=["pdf", "png", "jpg", "jpeg"], help="Limit 50MB per file")
|
| 207 |
+
|
| 208 |
+
# Sidebar with options
|
| 209 |
+
with st.sidebar:
|
| 210 |
+
st.header("Options")
|
| 211 |
+
|
| 212 |
+
# Model options
|
| 213 |
+
st.subheader("Model Settings")
|
| 214 |
+
use_vision = st.checkbox("Use Vision Model", value=True,
|
| 215 |
+
help="For image files, use the vision model for improved analysis (may be slower)")
|
| 216 |
|
| 217 |
+
# Image preprocessing options (collapsible)
|
| 218 |
+
st.subheader("Image Preprocessing")
|
| 219 |
+
with st.expander("Preprocessing Options"):
|
| 220 |
+
preprocessing_options = {}
|
| 221 |
+
preprocessing_options["grayscale"] = st.checkbox("Convert to Grayscale",
|
| 222 |
+
help="Convert image to grayscale before OCR")
|
| 223 |
+
preprocessing_options["threshold"] = st.checkbox("Apply Thresholding",
|
| 224 |
+
help="Apply adaptive thresholding to enhance text")
|
| 225 |
+
preprocessing_options["denoise"] = st.checkbox("Denoise Image",
|
| 226 |
+
help="Remove noise from the image")
|
| 227 |
+
preprocessing_options["contrast"] = st.slider("Adjust Contrast", -5, 5, 0,
|
| 228 |
+
help="Adjust image contrast (-5 to +5)")
|
| 229 |
|
| 230 |
+
# PDF options (collapsible)
|
| 231 |
+
st.subheader("PDF Options")
|
| 232 |
+
with st.expander("PDF Settings"):
|
| 233 |
+
pdf_dpi = st.slider("PDF Resolution (DPI)", 72, 300, 150,
|
| 234 |
+
help="Higher DPI gives better quality but slower processing")
|
| 235 |
+
max_pages = st.number_input("Maximum Pages to Process", 1, 20, 5,
|
| 236 |
+
help="Limit number of pages to process")
|
| 237 |
+
|
| 238 |
+
# About tab content
|
| 239 |
+
with main_tab2:
|
| 240 |
+
st.markdown("""
|
| 241 |
+
### About This Application
|
| 242 |
|
| 243 |
+
This app uses [Mistral AI's Document OCR](https://docs.mistral.ai/capabilities/document/) to extract text and images from historical documents.
|
|
|
|
|
|
|
| 244 |
|
| 245 |
+
It can process:
|
| 246 |
+
- Image files (jpg, png, etc.)
|
| 247 |
+
- PDF documents (multi-page support)
|
| 248 |
+
|
| 249 |
+
The extracted content is processed into structured data based on the document type, combining:
|
| 250 |
+
- Text extraction with `mistral-ocr-latest`
|
| 251 |
+
- Analysis with language models
|
| 252 |
+
- Layout preservation with images
|
| 253 |
+
|
| 254 |
+
View results in three formats:
|
| 255 |
+
- Structured HTML view
|
| 256 |
+
- Raw JSON (for developers)
|
| 257 |
+
- Markdown with images (preserves document layout)
|
| 258 |
+
|
| 259 |
+
**New Features:**
|
| 260 |
+
- Image preprocessing for better OCR quality
|
| 261 |
+
- PDF resolution and page controls
|
| 262 |
+
- Progress tracking during processing
|
| 263 |
+
""")
|
| 264 |
+
|
| 265 |
+
with main_tab1:
|
| 266 |
+
if uploaded_file is not None:
|
| 267 |
+
# Check file size (cap at 50MB)
|
| 268 |
+
file_size_mb = len(uploaded_file.getvalue()) / (1024 * 1024)
|
| 269 |
|
| 270 |
+
if file_size_mb > 50:
|
| 271 |
+
with upload_col:
|
| 272 |
+
st.error(f"File too large ({file_size_mb:.1f} MB). Maximum file size is 50MB.")
|
| 273 |
+
st.stop()
|
| 274 |
+
|
| 275 |
+
file_ext = Path(uploaded_file.name).suffix.lower()
|
| 276 |
+
|
| 277 |
+
# Display document preview in preview column
|
| 278 |
+
with preview_col:
|
| 279 |
+
st.subheader("Document Preview")
|
| 280 |
+
if file_ext == ".pdf":
|
| 281 |
+
try:
|
| 282 |
+
# Convert first page of PDF to image for preview
|
| 283 |
+
pdf_bytes = uploaded_file.getvalue()
|
| 284 |
+
images = convert_from_bytes(pdf_bytes, first_page=1, last_page=1, dpi=150)
|
| 285 |
+
|
| 286 |
+
if images:
|
| 287 |
+
# Convert PIL image to bytes for Streamlit
|
| 288 |
+
first_page = images[0]
|
| 289 |
+
img_bytes = io.BytesIO()
|
| 290 |
+
first_page.save(img_bytes, format='JPEG')
|
| 291 |
+
img_bytes.seek(0)
|
| 292 |
+
|
| 293 |
+
# Display the PDF preview
|
| 294 |
+
st.image(img_bytes, caption=f"PDF Preview: {uploaded_file.name}", use_container_width=True)
|
| 295 |
+
else:
|
| 296 |
+
st.info(f"PDF uploaded: {uploaded_file.name}")
|
| 297 |
+
except Exception:
|
| 298 |
+
# Simply show the file name without an error message
|
| 299 |
+
st.info(f"PDF uploaded: {uploaded_file.name}")
|
| 300 |
+
st.info("Click 'Process Document' to analyze the content.")
|
| 301 |
+
else:
|
| 302 |
+
st.image(uploaded_file, use_container_width=True)
|
| 303 |
+
|
| 304 |
+
# Add image preprocessing preview in a collapsible section if needed
|
| 305 |
+
if any(preprocessing_options.values()) and uploaded_file.type.startswith('image/'):
|
| 306 |
+
with st.expander("Image Preprocessing Preview"):
|
| 307 |
+
preview_cols = st.columns(2)
|
| 308 |
|
| 309 |
+
with preview_cols[0]:
|
| 310 |
+
st.markdown("**Original Image**")
|
| 311 |
+
st.image(uploaded_file, use_container_width=True)
|
| 312 |
|
| 313 |
+
with preview_cols[1]:
|
| 314 |
+
st.markdown("**Preprocessed Image**")
|
| 315 |
+
try:
|
| 316 |
+
processed_bytes = preprocess_image(uploaded_file.getvalue(), preprocessing_options)
|
| 317 |
+
st.image(io.BytesIO(processed_bytes), use_container_width=True)
|
| 318 |
+
except Exception as e:
|
| 319 |
+
st.error(f"Error in preprocessing: {str(e)}")
|
| 320 |
+
|
| 321 |
+
# Process button - flush left with similar padding as file browser
|
| 322 |
+
with upload_col:
|
| 323 |
+
process_button = st.button("Process Document", use_container_width=True)
|
| 324 |
+
|
| 325 |
+
# Results section
|
| 326 |
+
if process_button:
|
| 327 |
+
try:
|
| 328 |
+
# Get max_pages or default if not available
|
| 329 |
+
max_pages_value = max_pages if 'max_pages' in locals() else None
|
| 330 |
|
| 331 |
+
# Call process_file with all options
|
| 332 |
+
result = process_file(uploaded_file, use_vision, preprocessing_options)
|
| 333 |
+
|
| 334 |
+
# Create results tabs for better organization
|
| 335 |
+
results_tab1, results_tab2 = st.tabs(["Document Analysis", "Technical Details"])
|
| 336 |
+
|
| 337 |
+
with results_tab1:
|
| 338 |
+
# Create two columns for metadata and content
|
| 339 |
+
meta_col, content_col = st.columns([1, 2])
|
| 340 |
+
|
| 341 |
+
with meta_col:
|
| 342 |
+
st.subheader("Document Metadata")
|
| 343 |
+
st.success("**Document processed successfully**")
|
| 344 |
+
|
| 345 |
+
# Display file info
|
| 346 |
+
st.write(f"**File Name:** {result.get('file_name', uploaded_file.name)}")
|
| 347 |
+
|
| 348 |
+
# Display info if only limited pages were processed
|
| 349 |
+
if 'limited_pages' in result:
|
| 350 |
+
st.info(f"Processed {result['limited_pages']['processed']} of {result['limited_pages']['total']} pages")
|
| 351 |
+
|
| 352 |
+
# Display languages if available
|
| 353 |
+
if 'languages' in result:
|
| 354 |
+
languages = [lang for lang in result['languages'] if lang is not None]
|
| 355 |
+
if languages:
|
| 356 |
+
st.write(f"**Languages:** {', '.join(languages)}")
|
| 357 |
+
|
| 358 |
+
# Confidence score if available
|
| 359 |
+
if 'confidence_score' in result:
|
| 360 |
+
confidence = result['confidence_score']
|
| 361 |
+
st.write(f"**OCR Confidence:** {confidence:.1%}")
|
| 362 |
+
|
| 363 |
+
# Display topics if available
|
| 364 |
+
if 'topics' in result and result['topics']:
|
| 365 |
+
st.write(f"**Topics:** {', '.join(result['topics'])}")
|
| 366 |
+
|
| 367 |
+
with content_col:
|
| 368 |
+
st.subheader("Document Contents")
|
| 369 |
+
if 'ocr_contents' in result:
|
| 370 |
+
# Check if there are images in the OCR result
|
| 371 |
+
has_images = False
|
| 372 |
+
if 'raw_response' in result:
|
| 373 |
+
try:
|
| 374 |
+
has_images = any(page.images for page in result['raw_response'].pages)
|
| 375 |
+
except Exception:
|
| 376 |
+
has_images = False
|
| 377 |
+
|
| 378 |
+
# Create tabs for different views
|
| 379 |
+
if has_images:
|
| 380 |
+
view_tab1, view_tab2, view_tab3 = st.tabs(["Structured View", "Raw JSON", "With Images"])
|
| 381 |
+
else:
|
| 382 |
+
view_tab1, view_tab2 = st.tabs(["Structured View", "Raw JSON"])
|
| 383 |
+
|
| 384 |
+
with view_tab1:
|
| 385 |
+
# Display in a more user-friendly format based on the content structure
|
| 386 |
+
html_content = ""
|
| 387 |
+
if isinstance(result['ocr_contents'], dict):
|
| 388 |
+
for section, content in result['ocr_contents'].items():
|
| 389 |
+
if content: # Only display non-empty sections
|
| 390 |
+
section_title = f"<h4>{section.replace('_', ' ').title()}</h4>"
|
| 391 |
+
html_content += section_title
|
| 392 |
+
|
| 393 |
+
if isinstance(content, str):
|
| 394 |
+
html_content += f"<p>{content}</p>"
|
| 395 |
+
st.markdown(f"#### {section.replace('_', ' ').title()}")
|
| 396 |
+
st.markdown(content)
|
| 397 |
+
elif isinstance(content, list):
|
| 398 |
+
html_list = "<ul>"
|
| 399 |
+
st.markdown(f"#### {section.replace('_', ' ').title()}")
|
| 400 |
+
for item in content:
|
| 401 |
+
if isinstance(item, str):
|
| 402 |
+
html_list += f"<li>{item}</li>"
|
| 403 |
+
st.markdown(f"- {item}")
|
| 404 |
+
elif isinstance(item, dict):
|
| 405 |
+
html_list += f"<li>{json.dumps(item)}</li>"
|
| 406 |
+
st.json(item)
|
| 407 |
+
html_list += "</ul>"
|
| 408 |
+
html_content += html_list
|
| 409 |
+
elif isinstance(content, dict):
|
| 410 |
+
html_dict = "<dl>"
|
| 411 |
+
st.markdown(f"#### {section.replace('_', ' ').title()}")
|
| 412 |
+
for k, v in content.items():
|
| 413 |
+
html_dict += f"<dt><strong>{k}</strong></dt><dd>{v}</dd>"
|
| 414 |
+
st.markdown(f"**{k}:** {v}")
|
| 415 |
+
html_dict += "</dl>"
|
| 416 |
+
html_content += html_dict
|
| 417 |
+
|
| 418 |
+
# Add download button in a smaller section
|
| 419 |
+
with st.expander("Export Content"):
|
| 420 |
+
# Alternative download button
|
| 421 |
+
html_bytes = html_content.encode()
|
| 422 |
+
st.download_button(
|
| 423 |
+
label="Download as HTML",
|
| 424 |
+
data=html_bytes,
|
| 425 |
+
file_name="document_content.html",
|
| 426 |
+
mime="text/html"
|
| 427 |
+
)
|
| 428 |
+
|
| 429 |
+
with view_tab2:
|
| 430 |
+
# Show the raw JSON for developers
|
| 431 |
+
st.json(result)
|
| 432 |
+
|
| 433 |
+
if has_images:
|
| 434 |
+
with view_tab3:
|
| 435 |
+
# Show loading indicator while preparing images
|
| 436 |
+
with st.spinner("Preparing document with embedded images..."):
|
| 437 |
+
try:
|
| 438 |
+
# Import function
|
| 439 |
+
try:
|
| 440 |
+
from ocr_utils import get_combined_markdown
|
| 441 |
+
except ImportError:
|
| 442 |
+
st.error("Required module ocr_utils not found.")
|
| 443 |
+
st.stop()
|
| 444 |
+
|
| 445 |
+
# Check if raw_response is available
|
| 446 |
+
if 'raw_response' not in result:
|
| 447 |
+
st.warning("Raw OCR response not available. Cannot display images.")
|
| 448 |
+
st.stop()
|
| 449 |
+
|
| 450 |
+
# Validate the raw_response structure before processing
|
| 451 |
+
if not hasattr(result['raw_response'], 'pages'):
|
| 452 |
+
st.warning("Invalid OCR response format. Cannot display images.")
|
| 453 |
+
st.stop()
|
| 454 |
+
|
| 455 |
+
# Get the combined markdown with images
|
| 456 |
+
# Set a flag to compress images if needed
|
| 457 |
+
compress_images = True
|
| 458 |
+
max_image_width = 800 # Maximum width for images
|
| 459 |
+
|
| 460 |
+
try:
|
| 461 |
+
# First try to get combined markdown with compressed images
|
| 462 |
+
if compress_images and hasattr(result['raw_response'], 'pages'):
|
| 463 |
+
from ocr_utils import get_combined_markdown_compressed
|
| 464 |
+
combined_markdown = get_combined_markdown_compressed(
|
| 465 |
+
result['raw_response'],
|
| 466 |
+
max_width=max_image_width,
|
| 467 |
+
quality=85
|
| 468 |
+
)
|
| 469 |
+
else:
|
| 470 |
+
# Fall back to regular method if compression not available
|
| 471 |
+
combined_markdown = get_combined_markdown(result['raw_response'])
|
| 472 |
+
except (ImportError, AttributeError):
|
| 473 |
+
# Fall back to regular method
|
| 474 |
+
combined_markdown = get_combined_markdown(result['raw_response'])
|
| 475 |
+
|
| 476 |
+
if not combined_markdown or combined_markdown.strip() == "":
|
| 477 |
+
st.warning("No image content found in the document.")
|
| 478 |
+
st.stop()
|
| 479 |
+
|
| 480 |
+
# Check if there are many images that might cause loading issues
|
| 481 |
+
image_count = sum(len(page.images) for page in result['raw_response'].pages if hasattr(page, 'images'))
|
| 482 |
+
|
| 483 |
+
# Add warning for image-heavy documents
|
| 484 |
+
if image_count > 10:
|
| 485 |
+
st.warning(f"This document contains {image_count} images. Rendering may take longer than usual.")
|
| 486 |
+
|
| 487 |
+
# Add CSS to ensure proper spacing and handling of text and images
|
| 488 |
+
st.markdown("""
|
| 489 |
+
<style>
|
| 490 |
+
.markdown-text-container {
|
| 491 |
+
padding: 10px;
|
| 492 |
+
background-color: #f9f9f9;
|
| 493 |
+
border-radius: 5px;
|
| 494 |
+
}
|
| 495 |
+
.markdown-text-container img {
|
| 496 |
+
margin: 15px 0;
|
| 497 |
+
max-width: 100%;
|
| 498 |
+
border: 1px solid #ddd;
|
| 499 |
+
border-radius: 4px;
|
| 500 |
+
display: block;
|
| 501 |
+
}
|
| 502 |
+
.markdown-text-container p {
|
| 503 |
+
margin-bottom: 16px;
|
| 504 |
+
line-height: 1.6;
|
| 505 |
+
}
|
| 506 |
+
/* Add lazy loading for images to improve performance */
|
| 507 |
+
.markdown-text-container img {
|
| 508 |
+
loading: lazy;
|
| 509 |
+
}
|
| 510 |
+
</style>
|
| 511 |
+
""", unsafe_allow_html=True)
|
| 512 |
+
|
| 513 |
+
# For very image-heavy documents, show images in a paginated way
|
| 514 |
+
if image_count > 20:
|
| 515 |
+
# Show image content in a paginated way
|
| 516 |
+
st.write("Document contains many images. Showing in a paginated format:")
|
| 517 |
+
|
| 518 |
+
# Split the combined markdown by page separators
|
| 519 |
+
pages = combined_markdown.split("---")
|
| 520 |
+
|
| 521 |
+
# Create a page selector
|
| 522 |
+
page_num = st.selectbox("Select page to view:",
|
| 523 |
+
options=list(range(1, len(pages)+1)),
|
| 524 |
+
index=0)
|
| 525 |
+
|
| 526 |
+
# Display only the selected page
|
| 527 |
+
st.markdown(f"""
|
| 528 |
+
<div class="markdown-text-container">
|
| 529 |
+
{pages[page_num-1]}
|
| 530 |
+
</div>
|
| 531 |
+
""", unsafe_allow_html=True)
|
| 532 |
+
|
| 533 |
+
# Add note about pagination
|
| 534 |
+
st.info(f"Showing page {page_num} of {len(pages)}. Select a different page from the dropdown above.")
|
| 535 |
+
else:
|
| 536 |
+
# Wrap the markdown in a div with the class for styling
|
| 537 |
+
st.markdown(f"""
|
| 538 |
+
<div class="markdown-text-container">
|
| 539 |
+
{combined_markdown}
|
| 540 |
+
</div>
|
| 541 |
+
""", unsafe_allow_html=True)
|
| 542 |
+
|
| 543 |
+
# Add a download button for the combined content
|
| 544 |
+
st.download_button(
|
| 545 |
+
label="Download with Images (HTML)",
|
| 546 |
+
data=f"""
|
| 547 |
+
<html>
|
| 548 |
+
<head>
|
| 549 |
+
<style>
|
| 550 |
+
body {{ font-family: Arial, sans-serif; line-height: 1.6; }}
|
| 551 |
+
img {{ max-width: 100%; margin: 15px 0; }}
|
| 552 |
+
</style>
|
| 553 |
+
</head>
|
| 554 |
+
<body>
|
| 555 |
+
{combined_markdown}
|
| 556 |
+
</body>
|
| 557 |
+
</html>
|
| 558 |
+
""",
|
| 559 |
+
file_name="document_with_images.html",
|
| 560 |
+
mime="text/html"
|
| 561 |
+
)
|
| 562 |
+
|
| 563 |
+
except Exception as e:
|
| 564 |
+
st.error(f"Could not display document with images: {str(e)}")
|
| 565 |
+
st.info("Try refreshing or processing the document again.")
|
| 566 |
+
else:
|
| 567 |
+
st.error("No OCR content was extracted from the document.")
|
| 568 |
+
|
| 569 |
+
with results_tab2:
|
| 570 |
+
st.subheader("Raw Processing Results")
|
| 571 |
+
st.json(result)
|
| 572 |
+
|
| 573 |
+
except Exception as e:
|
| 574 |
+
st.error(f"Error processing document: {str(e)}")
|
| 575 |
+
else:
|
| 576 |
+
# Display sample images in the main area when no file is uploaded
|
| 577 |
+
st.info("Upload a document to get started using the file uploader above.")
|
| 578 |
+
|
| 579 |
+
# Show example images in a grid
|
| 580 |
+
st.subheader("Example Documents")
|
| 581 |
|
| 582 |
+
# Add a sample images container
|
| 583 |
+
with st.container():
|
| 584 |
+
# Find sample images from the input directory to display
|
| 585 |
+
input_dir = Path(__file__).parent / "input"
|
| 586 |
+
sample_images = []
|
| 587 |
+
if input_dir.exists():
|
| 588 |
+
# Find valid jpg files (with size > 50KB to avoid placeholders)
|
| 589 |
+
sample_images = [
|
| 590 |
+
path for path in input_dir.glob("*.jpg")
|
| 591 |
+
if path.stat().st_size > 50000
|
| 592 |
+
][:3] # Limit to 3 samples
|
| 593 |
|
| 594 |
+
if sample_images:
|
| 595 |
+
columns = st.columns(3)
|
| 596 |
+
for i, img_path in enumerate(sample_images):
|
| 597 |
+
with columns[i % 3]:
|
| 598 |
+
try:
|
| 599 |
+
st.image(str(img_path), caption=img_path.name, use_container_width=True)
|
| 600 |
+
except Exception as e:
|
| 601 |
+
st.error(f"Error loading image {img_path.name}: {str(e)}")
|
|
|
|
|
|
|
|
|
backup/app.py
ADDED
|
@@ -0,0 +1,535 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import streamlit as st
|
| 3 |
+
import json
|
| 4 |
+
import sys
|
| 5 |
+
import time
|
| 6 |
+
from pathlib import Path
|
| 7 |
+
import tempfile
|
| 8 |
+
import io
|
| 9 |
+
from pdf2image import convert_from_bytes
|
| 10 |
+
from PIL import Image, ImageEnhance, ImageFilter
|
| 11 |
+
import cv2
|
| 12 |
+
import numpy as np
|
| 13 |
+
|
| 14 |
+
# Import the StructuredOCR class and config from the local files
|
| 15 |
+
from structured_ocr import StructuredOCR
|
| 16 |
+
from config import MISTRAL_API_KEY
|
| 17 |
+
|
| 18 |
+
# Set page configuration
|
| 19 |
+
st.set_page_config(
|
| 20 |
+
page_title="Historical OCR",
|
| 21 |
+
page_icon="🚀",
|
| 22 |
+
layout="wide",
|
| 23 |
+
initial_sidebar_state="expanded"
|
| 24 |
+
)
|
| 25 |
+
|
| 26 |
+
# Enable caching for expensive operations
|
| 27 |
+
@st.cache_data(ttl=3600, show_spinner=False)
|
| 28 |
+
def convert_pdf_to_images(pdf_bytes, dpi=150):
|
| 29 |
+
"""Convert PDF bytes to a list of images with caching"""
|
| 30 |
+
try:
|
| 31 |
+
return convert_from_bytes(pdf_bytes, dpi=dpi)
|
| 32 |
+
except Exception as e:
|
| 33 |
+
st.error(f"Error converting PDF: {str(e)}")
|
| 34 |
+
return []
|
| 35 |
+
|
| 36 |
+
@st.cache_data(ttl=3600, show_spinner=False)
|
| 37 |
+
def preprocess_image(image_bytes, preprocessing_options):
|
| 38 |
+
"""Preprocess image with selected options"""
|
| 39 |
+
# Convert bytes to OpenCV format
|
| 40 |
+
image = Image.open(io.BytesIO(image_bytes))
|
| 41 |
+
img_array = np.array(image)
|
| 42 |
+
|
| 43 |
+
# Apply preprocessing based on selected options
|
| 44 |
+
if preprocessing_options.get("grayscale", False):
|
| 45 |
+
img_array = cv2.cvtColor(img_array, cv2.COLOR_RGB2GRAY)
|
| 46 |
+
img_array = cv2.cvtColor(img_array, cv2.COLOR_GRAY2RGB)
|
| 47 |
+
|
| 48 |
+
if preprocessing_options.get("contrast", 0) != 0:
|
| 49 |
+
contrast_factor = 1 + (preprocessing_options.get("contrast", 0) / 10)
|
| 50 |
+
image = Image.fromarray(img_array)
|
| 51 |
+
enhancer = ImageEnhance.Contrast(image)
|
| 52 |
+
image = enhancer.enhance(contrast_factor)
|
| 53 |
+
img_array = np.array(image)
|
| 54 |
+
|
| 55 |
+
if preprocessing_options.get("denoise", False):
|
| 56 |
+
img_array = cv2.fastNlMeansDenoisingColored(img_array, None, 10, 10, 7, 21)
|
| 57 |
+
|
| 58 |
+
if preprocessing_options.get("threshold", False):
|
| 59 |
+
# Convert to grayscale if not already
|
| 60 |
+
if len(img_array.shape) == 3:
|
| 61 |
+
gray = cv2.cvtColor(img_array, cv2.COLOR_RGB2GRAY)
|
| 62 |
+
else:
|
| 63 |
+
gray = img_array
|
| 64 |
+
# Apply adaptive threshold
|
| 65 |
+
binary = cv2.adaptiveThreshold(gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
|
| 66 |
+
cv2.THRESH_BINARY, 11, 2)
|
| 67 |
+
# Convert back to RGB
|
| 68 |
+
img_array = cv2.cvtColor(binary, cv2.COLOR_GRAY2RGB)
|
| 69 |
+
|
| 70 |
+
# Convert back to PIL Image
|
| 71 |
+
processed_image = Image.fromarray(img_array)
|
| 72 |
+
|
| 73 |
+
# Convert to bytes
|
| 74 |
+
byte_io = io.BytesIO()
|
| 75 |
+
processed_image.save(byte_io, format='PNG')
|
| 76 |
+
byte_io.seek(0)
|
| 77 |
+
|
| 78 |
+
return byte_io.getvalue()
|
| 79 |
+
|
| 80 |
+
# Define functions
|
| 81 |
+
def process_file(uploaded_file, use_vision=True, preprocessing_options=None):
|
| 82 |
+
"""Process the uploaded file and return the OCR results
|
| 83 |
+
|
| 84 |
+
Args:
|
| 85 |
+
uploaded_file: The uploaded file to process
|
| 86 |
+
use_vision: Whether to use vision model
|
| 87 |
+
preprocessing_options: Dictionary of preprocessing options
|
| 88 |
+
"""
|
| 89 |
+
if preprocessing_options is None:
|
| 90 |
+
preprocessing_options = {}
|
| 91 |
+
|
| 92 |
+
# Show progress indicator
|
| 93 |
+
progress_bar = st.progress(0)
|
| 94 |
+
status_text = st.empty()
|
| 95 |
+
status_text.text("Preparing file for processing...")
|
| 96 |
+
|
| 97 |
+
# Save the uploaded file to a temporary file
|
| 98 |
+
with tempfile.NamedTemporaryFile(delete=False, suffix=Path(uploaded_file.name).suffix) as tmp:
|
| 99 |
+
tmp.write(uploaded_file.getvalue())
|
| 100 |
+
temp_path = tmp.name
|
| 101 |
+
|
| 102 |
+
try:
|
| 103 |
+
# Check if API key is available
|
| 104 |
+
if not MISTRAL_API_KEY:
|
| 105 |
+
# Return dummy data if no API key
|
| 106 |
+
progress_bar.progress(100)
|
| 107 |
+
status_text.empty()
|
| 108 |
+
return {
|
| 109 |
+
"file_name": uploaded_file.name,
|
| 110 |
+
"topics": ["Sample Document"],
|
| 111 |
+
"languages": ["English"],
|
| 112 |
+
"ocr_contents": {
|
| 113 |
+
"title": "Sample Document",
|
| 114 |
+
"content": "This is sample content. To process real documents, please set the MISTRAL_API_KEY environment variable."
|
| 115 |
+
}
|
| 116 |
+
}
|
| 117 |
+
|
| 118 |
+
# Update progress
|
| 119 |
+
progress_bar.progress(20)
|
| 120 |
+
status_text.text("Initializing OCR processor...")
|
| 121 |
+
|
| 122 |
+
# Initialize OCR processor
|
| 123 |
+
processor = StructuredOCR()
|
| 124 |
+
|
| 125 |
+
# Determine file type from extension
|
| 126 |
+
file_ext = Path(uploaded_file.name).suffix.lower()
|
| 127 |
+
file_type = "pdf" if file_ext == ".pdf" else "image"
|
| 128 |
+
|
| 129 |
+
# Apply preprocessing if needed
|
| 130 |
+
if any(preprocessing_options.values()) and file_type == "image":
|
| 131 |
+
status_text.text("Applying image preprocessing...")
|
| 132 |
+
processed_bytes = preprocess_image(uploaded_file.getvalue(), preprocessing_options)
|
| 133 |
+
|
| 134 |
+
# Save processed image to temp file
|
| 135 |
+
with tempfile.NamedTemporaryFile(delete=False, suffix=Path(uploaded_file.name).suffix) as proc_tmp:
|
| 136 |
+
proc_tmp.write(processed_bytes)
|
| 137 |
+
temp_path = proc_tmp.name
|
| 138 |
+
|
| 139 |
+
# Get file size in MB
|
| 140 |
+
file_size_mb = os.path.getsize(temp_path) / (1024 * 1024)
|
| 141 |
+
|
| 142 |
+
# Check if file exceeds API limits (50 MB)
|
| 143 |
+
if file_size_mb > 50:
|
| 144 |
+
st.error(f"File too large ({file_size_mb:.1f} MB). Maximum file size allowed by Mistral API is 50MB.")
|
| 145 |
+
return {
|
| 146 |
+
"file_name": uploaded_file.name,
|
| 147 |
+
"topics": ["Document"],
|
| 148 |
+
"languages": ["English"],
|
| 149 |
+
"confidence_score": 0.0,
|
| 150 |
+
"error": f"File size {file_size_mb:.2f} MB exceeds Mistral API limit of 50 MB",
|
| 151 |
+
"ocr_contents": {
|
| 152 |
+
"error": f"Failed to process file: File size {file_size_mb:.2f} MB exceeds Mistral API limit of 50 MB",
|
| 153 |
+
"partial_text": "Document could not be processed due to size limitations."
|
| 154 |
+
}
|
| 155 |
+
}
|
| 156 |
+
|
| 157 |
+
# Update progress
|
| 158 |
+
progress_bar.progress(40)
|
| 159 |
+
status_text.text("Processing document with OCR...")
|
| 160 |
+
|
| 161 |
+
# Process the file with file size information for automatic page limiting
|
| 162 |
+
# Make sure we're using the latest mistral-ocr model
|
| 163 |
+
# See https://docs.mistral.ai/capabilities/document/ for more info
|
| 164 |
+
result = processor.process_file(temp_path, file_type=file_type, use_vision=use_vision, file_size_mb=file_size_mb)
|
| 165 |
+
|
| 166 |
+
# Complete progress
|
| 167 |
+
progress_bar.progress(100)
|
| 168 |
+
status_text.empty()
|
| 169 |
+
|
| 170 |
+
return result
|
| 171 |
+
except Exception as e:
|
| 172 |
+
progress_bar.progress(100)
|
| 173 |
+
status_text.empty()
|
| 174 |
+
st.error(f"Error during processing: {str(e)}")
|
| 175 |
+
raise
|
| 176 |
+
finally:
|
| 177 |
+
# Clean up the temporary file
|
| 178 |
+
if os.path.exists(temp_path):
|
| 179 |
+
os.unlink(temp_path)
|
| 180 |
+
|
| 181 |
+
# App title and description
|
| 182 |
+
st.title("Historical Document OCR")
|
| 183 |
+
st.subheader("Powered by Mistral AI")
|
| 184 |
+
|
| 185 |
+
# Create main layout with tabs and columns
|
| 186 |
+
main_tab1, main_tab2 = st.tabs(["Document Processing", "About"])
|
| 187 |
+
|
| 188 |
+
with main_tab1:
|
| 189 |
+
# Create a two-column layout for file upload and preview
|
| 190 |
+
upload_col, preview_col = st.columns([1, 1])
|
| 191 |
+
|
| 192 |
+
# File uploader in the left column
|
| 193 |
+
with upload_col:
|
| 194 |
+
st.markdown("""
|
| 195 |
+
Upload an image or PDF file to get started.
|
| 196 |
+
|
| 197 |
+
Using the latest `mistral-ocr-latest` model for advanced document understanding.
|
| 198 |
+
""")
|
| 199 |
+
uploaded_file = st.file_uploader("Choose a file", type=["pdf", "png", "jpg", "jpeg"])
|
| 200 |
+
|
| 201 |
+
# Sidebar with options
|
| 202 |
+
with st.sidebar:
|
| 203 |
+
st.header("Options")
|
| 204 |
+
|
| 205 |
+
# Model options
|
| 206 |
+
st.subheader("Model Settings")
|
| 207 |
+
use_vision = st.checkbox("Use Vision Model", value=True,
|
| 208 |
+
help="For image files, use the vision model for improved analysis (may be slower)")
|
| 209 |
+
|
| 210 |
+
# Image preprocessing options (collapsible)
|
| 211 |
+
st.subheader("Image Preprocessing")
|
| 212 |
+
with st.expander("Preprocessing Options"):
|
| 213 |
+
preprocessing_options = {}
|
| 214 |
+
preprocessing_options["grayscale"] = st.checkbox("Convert to Grayscale",
|
| 215 |
+
help="Convert image to grayscale before OCR")
|
| 216 |
+
preprocessing_options["threshold"] = st.checkbox("Apply Thresholding",
|
| 217 |
+
help="Apply adaptive thresholding to enhance text")
|
| 218 |
+
preprocessing_options["denoise"] = st.checkbox("Denoise Image",
|
| 219 |
+
help="Remove noise from the image")
|
| 220 |
+
preprocessing_options["contrast"] = st.slider("Adjust Contrast", -5, 5, 0,
|
| 221 |
+
help="Adjust image contrast (-5 to +5)")
|
| 222 |
+
|
| 223 |
+
# PDF options (collapsible)
|
| 224 |
+
st.subheader("PDF Options")
|
| 225 |
+
with st.expander("PDF Settings"):
|
| 226 |
+
pdf_dpi = st.slider("PDF Resolution (DPI)", 72, 300, 150,
|
| 227 |
+
help="Higher DPI gives better quality but slower processing")
|
| 228 |
+
max_pages = st.number_input("Maximum Pages to Process", 1, 20, 5,
|
| 229 |
+
help="Limit number of pages to process")
|
| 230 |
+
|
| 231 |
+
# About tab content
|
| 232 |
+
with main_tab2:
|
| 233 |
+
st.markdown("""
|
| 234 |
+
### About This Application
|
| 235 |
+
|
| 236 |
+
This app uses [Mistral AI's Document OCR](https://docs.mistral.ai/capabilities/document/) to extract text and images from historical documents.
|
| 237 |
+
|
| 238 |
+
It can process:
|
| 239 |
+
- Image files (jpg, png, etc.)
|
| 240 |
+
- PDF documents (multi-page support)
|
| 241 |
+
|
| 242 |
+
The extracted content is processed into structured data based on the document type, combining:
|
| 243 |
+
- Text extraction with `mistral-ocr-latest`
|
| 244 |
+
- Analysis with language models
|
| 245 |
+
- Layout preservation with images
|
| 246 |
+
|
| 247 |
+
View results in three formats:
|
| 248 |
+
- Structured HTML view
|
| 249 |
+
- Raw JSON (for developers)
|
| 250 |
+
- Markdown with images (preserves document layout)
|
| 251 |
+
|
| 252 |
+
**New Features:**
|
| 253 |
+
- Image preprocessing for better OCR quality
|
| 254 |
+
- PDF resolution and page controls
|
| 255 |
+
- Progress tracking during processing
|
| 256 |
+
""")
|
| 257 |
+
|
| 258 |
+
with main_tab1:
|
| 259 |
+
if uploaded_file is not None:
|
| 260 |
+
# Check file size (cap at 50MB)
|
| 261 |
+
file_size_mb = len(uploaded_file.getvalue()) / (1024 * 1024)
|
| 262 |
+
|
| 263 |
+
if file_size_mb > 50:
|
| 264 |
+
with upload_col:
|
| 265 |
+
st.error(f"File too large ({file_size_mb:.1f} MB). Maximum file size is 50MB.")
|
| 266 |
+
st.stop()
|
| 267 |
+
|
| 268 |
+
file_ext = Path(uploaded_file.name).suffix.lower()
|
| 269 |
+
|
| 270 |
+
# Display document preview in preview column
|
| 271 |
+
with preview_col:
|
| 272 |
+
st.subheader("Document Preview")
|
| 273 |
+
if file_ext == ".pdf":
|
| 274 |
+
try:
|
| 275 |
+
# Convert first page of PDF to image for preview
|
| 276 |
+
pdf_bytes = uploaded_file.getvalue()
|
| 277 |
+
images = convert_from_bytes(pdf_bytes, first_page=1, last_page=1, dpi=150)
|
| 278 |
+
|
| 279 |
+
if images:
|
| 280 |
+
# Convert PIL image to bytes for Streamlit
|
| 281 |
+
first_page = images[0]
|
| 282 |
+
img_bytes = io.BytesIO()
|
| 283 |
+
first_page.save(img_bytes, format='JPEG')
|
| 284 |
+
img_bytes.seek(0)
|
| 285 |
+
|
| 286 |
+
# Display the PDF preview
|
| 287 |
+
st.image(img_bytes, caption=f"PDF Preview: {uploaded_file.name}", use_container_width=True)
|
| 288 |
+
else:
|
| 289 |
+
st.info(f"PDF uploaded: {uploaded_file.name}")
|
| 290 |
+
except Exception:
|
| 291 |
+
# Simply show the file name without an error message
|
| 292 |
+
st.info(f"PDF uploaded: {uploaded_file.name}")
|
| 293 |
+
st.info("Click 'Process Document' to analyze the content.")
|
| 294 |
+
else:
|
| 295 |
+
st.image(uploaded_file, use_container_width=True)
|
| 296 |
+
|
| 297 |
+
# Add image preprocessing preview in a collapsible section if needed
|
| 298 |
+
if any(preprocessing_options.values()) and uploaded_file.type.startswith('image/'):
|
| 299 |
+
with st.expander("Image Preprocessing Preview"):
|
| 300 |
+
preview_cols = st.columns(2)
|
| 301 |
+
|
| 302 |
+
with preview_cols[0]:
|
| 303 |
+
st.markdown("**Original Image**")
|
| 304 |
+
st.image(uploaded_file, use_container_width=True)
|
| 305 |
+
|
| 306 |
+
with preview_cols[1]:
|
| 307 |
+
st.markdown("**Preprocessed Image**")
|
| 308 |
+
try:
|
| 309 |
+
processed_bytes = preprocess_image(uploaded_file.getvalue(), preprocessing_options)
|
| 310 |
+
st.image(io.BytesIO(processed_bytes), use_container_width=True)
|
| 311 |
+
except Exception as e:
|
| 312 |
+
st.error(f"Error in preprocessing: {str(e)}")
|
| 313 |
+
|
| 314 |
+
# Process button - flush left with similar padding as file browser
|
| 315 |
+
with upload_col:
|
| 316 |
+
process_button = st.button("Process Document", use_container_width=True)
|
| 317 |
+
|
| 318 |
+
# Results section
|
| 319 |
+
if process_button:
|
| 320 |
+
try:
|
| 321 |
+
# Get max_pages or default if not available
|
| 322 |
+
max_pages_value = max_pages if 'max_pages' in locals() else None
|
| 323 |
+
|
| 324 |
+
# Call process_file with all options
|
| 325 |
+
result = process_file(uploaded_file, use_vision, preprocessing_options)
|
| 326 |
+
|
| 327 |
+
# Create results tabs for better organization
|
| 328 |
+
results_tab1, results_tab2 = st.tabs(["Document Analysis", "Technical Details"])
|
| 329 |
+
|
| 330 |
+
with results_tab1:
|
| 331 |
+
# Create two columns for metadata and content
|
| 332 |
+
meta_col, content_col = st.columns([1, 2])
|
| 333 |
+
|
| 334 |
+
with meta_col:
|
| 335 |
+
st.subheader("Document Metadata")
|
| 336 |
+
st.success("**Document processed successfully**")
|
| 337 |
+
|
| 338 |
+
# Display file info
|
| 339 |
+
st.write(f"**File Name:** {result.get('file_name', uploaded_file.name)}")
|
| 340 |
+
|
| 341 |
+
# Display info if only limited pages were processed
|
| 342 |
+
if 'limited_pages' in result:
|
| 343 |
+
st.info(f"Processed {result['limited_pages']['processed']} of {result['limited_pages']['total']} pages")
|
| 344 |
+
|
| 345 |
+
# Display languages if available
|
| 346 |
+
if 'languages' in result:
|
| 347 |
+
languages = [lang for lang in result['languages'] if lang is not None]
|
| 348 |
+
if languages:
|
| 349 |
+
st.write(f"**Languages:** {', '.join(languages)}")
|
| 350 |
+
|
| 351 |
+
# Confidence score if available
|
| 352 |
+
if 'confidence_score' in result:
|
| 353 |
+
confidence = result['confidence_score']
|
| 354 |
+
st.write(f"**OCR Confidence:** {confidence:.1%}")
|
| 355 |
+
|
| 356 |
+
# Display topics if available
|
| 357 |
+
if 'topics' in result and result['topics']:
|
| 358 |
+
st.write(f"**Topics:** {', '.join(result['topics'])}")
|
| 359 |
+
|
| 360 |
+
with content_col:
|
| 361 |
+
st.subheader("Document Contents")
|
| 362 |
+
if 'ocr_contents' in result:
|
| 363 |
+
# Check if there are images in the OCR result
|
| 364 |
+
has_images = False
|
| 365 |
+
if 'raw_response' in result:
|
| 366 |
+
try:
|
| 367 |
+
has_images = any(page.images for page in result['raw_response'].pages)
|
| 368 |
+
except Exception:
|
| 369 |
+
has_images = False
|
| 370 |
+
|
| 371 |
+
# Create tabs for different views
|
| 372 |
+
if has_images:
|
| 373 |
+
view_tab1, view_tab2, view_tab3 = st.tabs(["Structured View", "Raw JSON", "With Images"])
|
| 374 |
+
else:
|
| 375 |
+
view_tab1, view_tab2 = st.tabs(["Structured View", "Raw JSON"])
|
| 376 |
+
|
| 377 |
+
with view_tab1:
|
| 378 |
+
# Display in a more user-friendly format based on the content structure
|
| 379 |
+
html_content = ""
|
| 380 |
+
if isinstance(result['ocr_contents'], dict):
|
| 381 |
+
for section, content in result['ocr_contents'].items():
|
| 382 |
+
if content: # Only display non-empty sections
|
| 383 |
+
section_title = f"<h4>{section.replace('_', ' ').title()}</h4>"
|
| 384 |
+
html_content += section_title
|
| 385 |
+
|
| 386 |
+
if isinstance(content, str):
|
| 387 |
+
html_content += f"<p>{content}</p>"
|
| 388 |
+
st.markdown(f"#### {section.replace('_', ' ').title()}")
|
| 389 |
+
st.markdown(content)
|
| 390 |
+
elif isinstance(content, list):
|
| 391 |
+
html_list = "<ul>"
|
| 392 |
+
st.markdown(f"#### {section.replace('_', ' ').title()}")
|
| 393 |
+
for item in content:
|
| 394 |
+
if isinstance(item, str):
|
| 395 |
+
html_list += f"<li>{item}</li>"
|
| 396 |
+
st.markdown(f"- {item}")
|
| 397 |
+
elif isinstance(item, dict):
|
| 398 |
+
html_list += f"<li>{json.dumps(item)}</li>"
|
| 399 |
+
st.json(item)
|
| 400 |
+
html_list += "</ul>"
|
| 401 |
+
html_content += html_list
|
| 402 |
+
elif isinstance(content, dict):
|
| 403 |
+
html_dict = "<dl>"
|
| 404 |
+
st.markdown(f"#### {section.replace('_', ' ').title()}")
|
| 405 |
+
for k, v in content.items():
|
| 406 |
+
html_dict += f"<dt><strong>{k}</strong></dt><dd>{v}</dd>"
|
| 407 |
+
st.markdown(f"**{k}:** {v}")
|
| 408 |
+
html_dict += "</dl>"
|
| 409 |
+
html_content += html_dict
|
| 410 |
+
|
| 411 |
+
# Add download button in a smaller section
|
| 412 |
+
with st.expander("Export Content"):
|
| 413 |
+
# Alternative download button
|
| 414 |
+
html_bytes = html_content.encode()
|
| 415 |
+
st.download_button(
|
| 416 |
+
label="Download as HTML",
|
| 417 |
+
data=html_bytes,
|
| 418 |
+
file_name="document_content.html",
|
| 419 |
+
mime="text/html"
|
| 420 |
+
)
|
| 421 |
+
|
| 422 |
+
with view_tab2:
|
| 423 |
+
# Show the raw JSON for developers
|
| 424 |
+
st.json(result)
|
| 425 |
+
|
| 426 |
+
if has_images:
|
| 427 |
+
with view_tab3:
|
| 428 |
+
# Show loading indicator while preparing images
|
| 429 |
+
with st.spinner("Preparing document with embedded images..."):
|
| 430 |
+
try:
|
| 431 |
+
# Import function
|
| 432 |
+
try:
|
| 433 |
+
from ocr_utils import get_combined_markdown
|
| 434 |
+
except ImportError:
|
| 435 |
+
st.error("Required module ocr_utils not found.")
|
| 436 |
+
st.stop()
|
| 437 |
+
|
| 438 |
+
# Check if raw_response is available
|
| 439 |
+
if 'raw_response' not in result:
|
| 440 |
+
st.warning("Raw OCR response not available. Cannot display images.")
|
| 441 |
+
st.stop()
|
| 442 |
+
|
| 443 |
+
# Validate the raw_response structure before processing
|
| 444 |
+
if not hasattr(result['raw_response'], 'pages'):
|
| 445 |
+
st.warning("Invalid OCR response format. Cannot display images.")
|
| 446 |
+
st.stop()
|
| 447 |
+
|
| 448 |
+
# Get the combined markdown with images
|
| 449 |
+
combined_markdown = get_combined_markdown(result['raw_response'])
|
| 450 |
+
|
| 451 |
+
if not combined_markdown or combined_markdown.strip() == "":
|
| 452 |
+
st.warning("No image content found in the document.")
|
| 453 |
+
st.stop()
|
| 454 |
+
|
| 455 |
+
# Add CSS to ensure proper spacing and handling of text and images
|
| 456 |
+
st.markdown("""
|
| 457 |
+
<style>
|
| 458 |
+
.markdown-text-container {
|
| 459 |
+
padding: 10px;
|
| 460 |
+
background-color: #f9f9f9;
|
| 461 |
+
border-radius: 5px;
|
| 462 |
+
}
|
| 463 |
+
.markdown-text-container img {
|
| 464 |
+
margin: 15px 0;
|
| 465 |
+
max-width: 100%;
|
| 466 |
+
border: 1px solid #ddd;
|
| 467 |
+
border-radius: 4px;
|
| 468 |
+
display: block;
|
| 469 |
+
}
|
| 470 |
+
.markdown-text-container p {
|
| 471 |
+
margin-bottom: 16px;
|
| 472 |
+
line-height: 1.6;
|
| 473 |
+
}
|
| 474 |
+
</style>
|
| 475 |
+
""", unsafe_allow_html=True)
|
| 476 |
+
|
| 477 |
+
# Wrap the markdown in a div with the class for styling
|
| 478 |
+
st.markdown(f"""
|
| 479 |
+
<div class="markdown-text-container">
|
| 480 |
+
{combined_markdown}
|
| 481 |
+
</div>
|
| 482 |
+
""", unsafe_allow_html=True)
|
| 483 |
+
|
| 484 |
+
# Add a download button for the combined content
|
| 485 |
+
st.download_button(
|
| 486 |
+
label="Download with Images (HTML)",
|
| 487 |
+
data=f"""
|
| 488 |
+
<html>
|
| 489 |
+
<head>
|
| 490 |
+
<style>
|
| 491 |
+
body {{ font-family: Arial, sans-serif; line-height: 1.6; }}
|
| 492 |
+
img {{ max-width: 100%; margin: 15px 0; }}
|
| 493 |
+
</style>
|
| 494 |
+
</head>
|
| 495 |
+
<body>
|
| 496 |
+
{combined_markdown}
|
| 497 |
+
</body>
|
| 498 |
+
</html>
|
| 499 |
+
""",
|
| 500 |
+
file_name="document_with_images.html",
|
| 501 |
+
mime="text/html"
|
| 502 |
+
)
|
| 503 |
+
|
| 504 |
+
except Exception as e:
|
| 505 |
+
st.error(f"Could not display document with images: {str(e)}")
|
| 506 |
+
st.info("Try refreshing or processing the document again.")
|
| 507 |
+
else:
|
| 508 |
+
st.error("No OCR content was extracted from the document.")
|
| 509 |
+
|
| 510 |
+
with results_tab2:
|
| 511 |
+
st.subheader("Raw Processing Results")
|
| 512 |
+
st.json(result)
|
| 513 |
+
|
| 514 |
+
except Exception as e:
|
| 515 |
+
st.error(f"Error processing document: {str(e)}")
|
| 516 |
+
else:
|
| 517 |
+
# Display sample images in the main area when no file is uploaded
|
| 518 |
+
st.info("Upload a document to get started using the file uploader above.")
|
| 519 |
+
|
| 520 |
+
# Show example images in a grid
|
| 521 |
+
st.subheader("Example Documents")
|
| 522 |
+
|
| 523 |
+
# Add a sample images container
|
| 524 |
+
with st.container():
|
| 525 |
+
# Find sample images from the input directory to display
|
| 526 |
+
input_dir = Path(__file__).parent / "input"
|
| 527 |
+
sample_images = []
|
| 528 |
+
if input_dir.exists():
|
| 529 |
+
sample_images = list(input_dir.glob("*.jpg"))[:3] # Limit to 3 samples
|
| 530 |
+
|
| 531 |
+
if sample_images:
|
| 532 |
+
columns = st.columns(3)
|
| 533 |
+
for i, img_path in enumerate(sample_images):
|
| 534 |
+
with columns[i % 3]:
|
| 535 |
+
st.image(str(img_path), caption=img_path.name, use_container_width=True)
|
backup/config.py
ADDED
|
@@ -0,0 +1,17 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# config.py
|
| 2 |
+
"""
|
| 3 |
+
Configuration file for Mistral OCR processing.
|
| 4 |
+
Contains API key and other settings.
|
| 5 |
+
"""
|
| 6 |
+
import os
|
| 7 |
+
|
| 8 |
+
# Your Mistral API key - get from Hugging Face secrets or environment variable
|
| 9 |
+
# The priority order is: HF_SPACES environment var > regular environment var > empty string
|
| 10 |
+
# Note: No default API key is provided for security reasons
|
| 11 |
+
MISTRAL_API_KEY = os.environ.get("HF_MISTRAL_API_KEY", # First check HF-specific env var
|
| 12 |
+
os.environ.get("MISTRAL_API_KEY", "")) # Then check regular env var
|
| 13 |
+
|
| 14 |
+
# Model settings
|
| 15 |
+
OCR_MODEL = "mistral-ocr-latest"
|
| 16 |
+
TEXT_MODEL = "ministral-8b-latest"
|
| 17 |
+
VISION_MODEL = "pixtral-12b-latest"
|
input/magician-or-bottle-cungerer.jpg → backup/input/The Magician, or Bottle Cungerer.jpeg
RENAMED
|
File without changes
|
input/baldwin-15th-north.jpg → backup/input/baldwin-letter-1.jpg
RENAMED
|
File without changes
|
input/americae-retectio.jpg → backup/input/baldwin-letter-2.jpg
RENAMED
|
File without changes
|
backup/input/flier.png
ADDED
|
input/baldwin-letter.jpg → backup/input/letter-1.jpg
RENAMED
|
File without changes
|
input/gender.jpg → backup/input/letter-2.jpg
RENAMED
|
File without changes
|
input/photo-baldwin-letter.jpg → backup/input/letter-3.jpg
RENAMED
|
File without changes
|
backup/input/magellan-travels.jpg
ADDED
|
Git LFS Details
|
input/handwritten-journal.jpg → backup/input/menu.pdf
RENAMED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
-
size
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:42d96008f374f5be8046b569c868e33f4e5a0e5e166c245d324b44140c7e6c2e
|
| 3 |
+
size 2554815
|
backup/input/recipe.jpg
ADDED
|
backup/ocr_utils.py
ADDED
|
@@ -0,0 +1,136 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Utility functions for OCR processing with Mistral AI.
|
| 3 |
+
Contains helper functions for working with OCR responses and image handling.
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
import json
|
| 7 |
+
import base64
|
| 8 |
+
from pathlib import Path
|
| 9 |
+
from typing import Dict, List, Optional, Union
|
| 10 |
+
|
| 11 |
+
from mistralai import DocumentURLChunk, ImageURLChunk, TextChunk
|
| 12 |
+
|
| 13 |
+
def replace_images_in_markdown(markdown_str: str, images_dict: dict) -> str:
|
| 14 |
+
"""
|
| 15 |
+
Replace image placeholders in markdown with base64-encoded images.
|
| 16 |
+
|
| 17 |
+
Args:
|
| 18 |
+
markdown_str: Markdown text containing image placeholders
|
| 19 |
+
images_dict: Dictionary mapping image IDs to base64 strings
|
| 20 |
+
|
| 21 |
+
Returns:
|
| 22 |
+
Markdown text with images replaced by base64 data
|
| 23 |
+
"""
|
| 24 |
+
for img_name, base64_str in images_dict.items():
|
| 25 |
+
markdown_str = markdown_str.replace(
|
| 26 |
+
f"", f""
|
| 27 |
+
)
|
| 28 |
+
return markdown_str
|
| 29 |
+
|
| 30 |
+
def get_combined_markdown(ocr_response) -> str:
|
| 31 |
+
"""
|
| 32 |
+
Combine OCR text and images into a single markdown document.
|
| 33 |
+
Ensures proper spacing between text and images.
|
| 34 |
+
|
| 35 |
+
Args:
|
| 36 |
+
ocr_response: Response from OCR processing containing text and images
|
| 37 |
+
See https://docs.mistral.ai/capabilities/document/ for API reference
|
| 38 |
+
|
| 39 |
+
Returns:
|
| 40 |
+
Combined markdown string with embedded images
|
| 41 |
+
"""
|
| 42 |
+
markdowns: list[str] = []
|
| 43 |
+
# Extract images from page
|
| 44 |
+
for page in ocr_response.pages:
|
| 45 |
+
image_data = {}
|
| 46 |
+
for img in page.images:
|
| 47 |
+
image_data[img.id] = img.image_base64
|
| 48 |
+
|
| 49 |
+
# Replace image placeholders with actual images
|
| 50 |
+
page_markdown = replace_images_in_markdown(page.markdown, image_data)
|
| 51 |
+
|
| 52 |
+
# Ensure proper spacing between paragraphs and images
|
| 53 |
+
# Add extra newlines between paragraphs to improve rendering
|
| 54 |
+
page_markdown = page_markdown.replace("\n", "\n\n")
|
| 55 |
+
|
| 56 |
+
# Add page separator for multi-page documents
|
| 57 |
+
markdowns.append(page_markdown)
|
| 58 |
+
|
| 59 |
+
# Join pages with clear separators for multi-page documents
|
| 60 |
+
return "\n\n---\n\n".join(markdowns)
|
| 61 |
+
|
| 62 |
+
def encode_image_for_api(image_path: Union[str, Path]) -> str:
|
| 63 |
+
"""
|
| 64 |
+
Encode an image as base64 for API use.
|
| 65 |
+
|
| 66 |
+
Args:
|
| 67 |
+
image_path: Path to the image file
|
| 68 |
+
|
| 69 |
+
Returns:
|
| 70 |
+
Base64 data URL for the image
|
| 71 |
+
"""
|
| 72 |
+
# Convert to Path object if string
|
| 73 |
+
image_file = Path(image_path) if isinstance(image_path, str) else image_path
|
| 74 |
+
|
| 75 |
+
# Verify image exists
|
| 76 |
+
if not image_file.is_file():
|
| 77 |
+
raise FileNotFoundError(f"Image file not found: {image_file}")
|
| 78 |
+
|
| 79 |
+
# Encode image as base64
|
| 80 |
+
encoded = base64.b64encode(image_file.read_bytes()).decode()
|
| 81 |
+
return f"data:image/jpeg;base64,{encoded}"
|
| 82 |
+
|
| 83 |
+
def process_image_with_ocr(client, image_path: Union[str, Path], model: str = "mistral-ocr-latest"):
|
| 84 |
+
"""
|
| 85 |
+
Process an image with OCR and return the response.
|
| 86 |
+
|
| 87 |
+
Args:
|
| 88 |
+
client: Mistral AI client
|
| 89 |
+
image_path: Path to the image file
|
| 90 |
+
model: OCR model to use
|
| 91 |
+
|
| 92 |
+
Returns:
|
| 93 |
+
OCR response object
|
| 94 |
+
"""
|
| 95 |
+
# Encode image as base64
|
| 96 |
+
base64_data_url = encode_image_for_api(image_path)
|
| 97 |
+
|
| 98 |
+
# Process image with OCR
|
| 99 |
+
image_response = client.ocr.process(
|
| 100 |
+
document=ImageURLChunk(image_url=base64_data_url),
|
| 101 |
+
model=model
|
| 102 |
+
)
|
| 103 |
+
|
| 104 |
+
return image_response
|
| 105 |
+
|
| 106 |
+
def ocr_response_to_json(ocr_response, indent: int = 4) -> str:
|
| 107 |
+
"""
|
| 108 |
+
Convert OCR response to a formatted JSON string.
|
| 109 |
+
|
| 110 |
+
Args:
|
| 111 |
+
ocr_response: OCR response object
|
| 112 |
+
indent: Indentation level for JSON formatting
|
| 113 |
+
|
| 114 |
+
Returns:
|
| 115 |
+
Formatted JSON string
|
| 116 |
+
"""
|
| 117 |
+
# Convert response to JSON
|
| 118 |
+
response_dict = json.loads(ocr_response.model_dump_json())
|
| 119 |
+
return json.dumps(response_dict, indent=indent)
|
| 120 |
+
|
| 121 |
+
# For display in notebooks
|
| 122 |
+
try:
|
| 123 |
+
from IPython.display import Markdown, display
|
| 124 |
+
|
| 125 |
+
def display_ocr_with_images(ocr_response):
|
| 126 |
+
"""
|
| 127 |
+
Display OCR response with embedded images in IPython environments.
|
| 128 |
+
|
| 129 |
+
Args:
|
| 130 |
+
ocr_response: OCR response object
|
| 131 |
+
"""
|
| 132 |
+
combined_markdown = get_combined_markdown(ocr_response)
|
| 133 |
+
display(Markdown(combined_markdown))
|
| 134 |
+
except ImportError:
|
| 135 |
+
# IPython not available
|
| 136 |
+
pass
|
backup/pdf_ocr.py
ADDED
|
@@ -0,0 +1,76 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
PDFOCR - Module for processing PDF files with OCR and extracting structured data.
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
import json
|
| 7 |
+
from pathlib import Path
|
| 8 |
+
from structured_ocr import StructuredOCR
|
| 9 |
+
|
| 10 |
+
class PDFOCR:
|
| 11 |
+
"""Class for processing PDF files with OCR and extracting structured data."""
|
| 12 |
+
|
| 13 |
+
def __init__(self, api_key=None):
|
| 14 |
+
"""Initialize the PDF OCR processor."""
|
| 15 |
+
self.processor = StructuredOCR(api_key=api_key)
|
| 16 |
+
|
| 17 |
+
def process_pdf(self, pdf_path, use_vision=True):
|
| 18 |
+
"""
|
| 19 |
+
Process a PDF file with OCR and extract structured data.
|
| 20 |
+
|
| 21 |
+
Args:
|
| 22 |
+
pdf_path: Path to the PDF file
|
| 23 |
+
use_vision: Whether to use vision model for improved analysis
|
| 24 |
+
|
| 25 |
+
Returns:
|
| 26 |
+
Dictionary with structured OCR results
|
| 27 |
+
"""
|
| 28 |
+
pdf_path = Path(pdf_path)
|
| 29 |
+
if not pdf_path.exists():
|
| 30 |
+
raise FileNotFoundError(f"PDF file not found: {pdf_path}")
|
| 31 |
+
|
| 32 |
+
return self.processor.process_file(pdf_path, file_type="pdf", use_vision=use_vision)
|
| 33 |
+
|
| 34 |
+
def save_json_output(self, pdf_path, output_path, use_vision=True):
|
| 35 |
+
"""
|
| 36 |
+
Process a PDF file and save the structured output as JSON.
|
| 37 |
+
|
| 38 |
+
Args:
|
| 39 |
+
pdf_path: Path to the PDF file
|
| 40 |
+
output_path: Path where to save the JSON output
|
| 41 |
+
use_vision: Whether to use vision model for improved analysis
|
| 42 |
+
|
| 43 |
+
Returns:
|
| 44 |
+
Path to the saved JSON file
|
| 45 |
+
"""
|
| 46 |
+
# Process the PDF
|
| 47 |
+
result = self.process_pdf(pdf_path, use_vision=use_vision)
|
| 48 |
+
|
| 49 |
+
# Save the result to JSON
|
| 50 |
+
output_path = Path(output_path)
|
| 51 |
+
output_path.parent.mkdir(parents=True, exist_ok=True)
|
| 52 |
+
|
| 53 |
+
with open(output_path, 'w') as f:
|
| 54 |
+
json.dump(result, f, indent=2)
|
| 55 |
+
|
| 56 |
+
return output_path
|
| 57 |
+
|
| 58 |
+
# For testing directly
|
| 59 |
+
if __name__ == "__main__":
|
| 60 |
+
import sys
|
| 61 |
+
|
| 62 |
+
if len(sys.argv) < 2:
|
| 63 |
+
print("Usage: python pdf_ocr.py <pdf_path> [output_path]")
|
| 64 |
+
sys.exit(1)
|
| 65 |
+
|
| 66 |
+
pdf_path = sys.argv[1]
|
| 67 |
+
output_path = sys.argv[2] if len(sys.argv) > 2 else None
|
| 68 |
+
|
| 69 |
+
processor = PDFOCR()
|
| 70 |
+
|
| 71 |
+
if output_path:
|
| 72 |
+
result_path = processor.save_json_output(pdf_path, output_path)
|
| 73 |
+
print(f"Results saved to: {result_path}")
|
| 74 |
+
else:
|
| 75 |
+
result = processor.process_pdf(pdf_path)
|
| 76 |
+
print(json.dumps(result, indent=2))
|
backup/requirements.txt
ADDED
|
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
streamlit>=1.43.2
|
| 2 |
+
mistralai>=0.0.7
|
| 3 |
+
pydantic>=2.0.0
|
| 4 |
+
pycountry>=23.12.11
|
| 5 |
+
pillow>=10.0.0
|
| 6 |
+
python-multipart>=0.0.6
|
| 7 |
+
pdf2image>=1.17.0
|
| 8 |
+
pytesseract>=0.3.10
|
| 9 |
+
opencv-python-headless>=4.6.0
|
| 10 |
+
numpy>=1.23.5
|
backup/structured_ocr.py
ADDED
|
@@ -0,0 +1,414 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import sys
|
| 3 |
+
import time
|
| 4 |
+
from enum import Enum
|
| 5 |
+
from pathlib import Path
|
| 6 |
+
import json
|
| 7 |
+
import base64
|
| 8 |
+
import pycountry
|
| 9 |
+
import logging
|
| 10 |
+
from pydantic import BaseModel
|
| 11 |
+
from mistralai import Mistral
|
| 12 |
+
from mistralai import DocumentURLChunk, ImageURLChunk, TextChunk
|
| 13 |
+
|
| 14 |
+
# Configure logging
|
| 15 |
+
logging.basicConfig(level=logging.INFO,
|
| 16 |
+
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
|
| 17 |
+
|
| 18 |
+
# Import utilities for OCR processing
|
| 19 |
+
try:
|
| 20 |
+
from ocr_utils import replace_images_in_markdown, get_combined_markdown
|
| 21 |
+
except ImportError:
|
| 22 |
+
# Define fallback functions if module not found
|
| 23 |
+
def replace_images_in_markdown(markdown_str, images_dict):
|
| 24 |
+
for img_name, base64_str in images_dict.items():
|
| 25 |
+
markdown_str = markdown_str.replace(
|
| 26 |
+
f"", f""
|
| 27 |
+
)
|
| 28 |
+
return markdown_str
|
| 29 |
+
|
| 30 |
+
def get_combined_markdown(ocr_response):
|
| 31 |
+
markdowns = []
|
| 32 |
+
for page in ocr_response.pages:
|
| 33 |
+
image_data = {}
|
| 34 |
+
for img in page.images:
|
| 35 |
+
image_data[img.id] = img.image_base64
|
| 36 |
+
markdowns.append(replace_images_in_markdown(page.markdown, image_data))
|
| 37 |
+
return "\n\n".join(markdowns)
|
| 38 |
+
|
| 39 |
+
# Import config directly (now local to historical-ocr)
|
| 40 |
+
from config import MISTRAL_API_KEY, OCR_MODEL, TEXT_MODEL, VISION_MODEL
|
| 41 |
+
|
| 42 |
+
# Create language enum for structured output
|
| 43 |
+
languages = {lang.alpha_2: lang.name for lang in pycountry.languages if hasattr(lang, 'alpha_2')}
|
| 44 |
+
|
| 45 |
+
class LanguageMeta(Enum.__class__):
|
| 46 |
+
def __new__(metacls, cls, bases, classdict):
|
| 47 |
+
for code, name in languages.items():
|
| 48 |
+
classdict[name.upper().replace(' ', '_')] = name
|
| 49 |
+
return super().__new__(metacls, cls, bases, classdict)
|
| 50 |
+
|
| 51 |
+
class Language(Enum, metaclass=LanguageMeta):
|
| 52 |
+
pass
|
| 53 |
+
|
| 54 |
+
class StructuredOCRModel(BaseModel):
|
| 55 |
+
file_name: str
|
| 56 |
+
topics: list[str]
|
| 57 |
+
languages: list[Language]
|
| 58 |
+
ocr_contents: dict
|
| 59 |
+
|
| 60 |
+
class StructuredOCR:
|
| 61 |
+
def __init__(self, api_key=None):
|
| 62 |
+
"""Initialize the OCR processor with API key"""
|
| 63 |
+
self.api_key = api_key or MISTRAL_API_KEY
|
| 64 |
+
self.client = Mistral(api_key=self.api_key)
|
| 65 |
+
|
| 66 |
+
def process_file(self, file_path, file_type=None, use_vision=True, max_pages=None, file_size_mb=None, custom_pages=None):
|
| 67 |
+
"""Process a file and return structured OCR results
|
| 68 |
+
|
| 69 |
+
Args:
|
| 70 |
+
file_path: Path to the file to process
|
| 71 |
+
file_type: 'pdf' or 'image' (will be auto-detected if None)
|
| 72 |
+
use_vision: Whether to use vision model for improved analysis
|
| 73 |
+
max_pages: Optional limit on number of pages to process
|
| 74 |
+
file_size_mb: Optional file size in MB (used for automatic page limiting)
|
| 75 |
+
custom_pages: Optional list of specific page numbers to process
|
| 76 |
+
|
| 77 |
+
Returns:
|
| 78 |
+
Dictionary with structured OCR results
|
| 79 |
+
"""
|
| 80 |
+
# Convert file_path to Path object if it's a string
|
| 81 |
+
file_path = Path(file_path)
|
| 82 |
+
|
| 83 |
+
# Auto-detect file type if not provided
|
| 84 |
+
if file_type is None:
|
| 85 |
+
suffix = file_path.suffix.lower()
|
| 86 |
+
file_type = "pdf" if suffix == ".pdf" else "image"
|
| 87 |
+
|
| 88 |
+
# Get file size if not provided
|
| 89 |
+
if file_size_mb is None and file_path.exists():
|
| 90 |
+
file_size_mb = file_path.stat().st_size / (1024 * 1024) # Convert bytes to MB
|
| 91 |
+
|
| 92 |
+
# Check if file exceeds API limits (50 MB)
|
| 93 |
+
if file_size_mb and file_size_mb > 50:
|
| 94 |
+
logging.warning(f"File size {file_size_mb:.2f} MB exceeds Mistral API limit of 50 MB")
|
| 95 |
+
return {
|
| 96 |
+
"file_name": file_path.name,
|
| 97 |
+
"topics": ["Document"],
|
| 98 |
+
"languages": ["English"],
|
| 99 |
+
"confidence_score": 0.0,
|
| 100 |
+
"error": f"File size {file_size_mb:.2f} MB exceeds API limit of 50 MB",
|
| 101 |
+
"ocr_contents": {
|
| 102 |
+
"error": f"Failed to process file: File size {file_size_mb:.2f} MB exceeds Mistral API limit of 50 MB",
|
| 103 |
+
"partial_text": "Document could not be processed due to size limitations."
|
| 104 |
+
}
|
| 105 |
+
}
|
| 106 |
+
|
| 107 |
+
# For PDF files, limit pages based on file size if no explicit limit is given
|
| 108 |
+
if file_type == "pdf" and file_size_mb and max_pages is None and custom_pages is None:
|
| 109 |
+
if file_size_mb > 100: # Very large files
|
| 110 |
+
max_pages = 3
|
| 111 |
+
elif file_size_mb > 50: # Large files
|
| 112 |
+
max_pages = 5
|
| 113 |
+
elif file_size_mb > 20: # Medium files
|
| 114 |
+
max_pages = 10
|
| 115 |
+
else: # Small files
|
| 116 |
+
max_pages = None # Process all pages
|
| 117 |
+
|
| 118 |
+
# Start processing timer
|
| 119 |
+
start_time = time.time()
|
| 120 |
+
|
| 121 |
+
# Read and process the file
|
| 122 |
+
if file_type == "pdf":
|
| 123 |
+
result = self._process_pdf(file_path, use_vision, max_pages, custom_pages)
|
| 124 |
+
else:
|
| 125 |
+
result = self._process_image(file_path, use_vision)
|
| 126 |
+
|
| 127 |
+
# Add processing time information
|
| 128 |
+
processing_time = time.time() - start_time
|
| 129 |
+
result['processing_time'] = processing_time
|
| 130 |
+
|
| 131 |
+
# Add a default confidence score if not present
|
| 132 |
+
if 'confidence_score' not in result:
|
| 133 |
+
result['confidence_score'] = 0.85 # Default confidence
|
| 134 |
+
|
| 135 |
+
return result
|
| 136 |
+
|
| 137 |
+
def _process_pdf(self, file_path, use_vision=True, max_pages=None, custom_pages=None):
|
| 138 |
+
"""Process a PDF file with OCR
|
| 139 |
+
|
| 140 |
+
Args:
|
| 141 |
+
file_path: Path to the PDF file
|
| 142 |
+
use_vision: Whether to use vision model
|
| 143 |
+
max_pages: Optional limit on the number of pages to process
|
| 144 |
+
custom_pages: Optional list of specific page numbers to process
|
| 145 |
+
"""
|
| 146 |
+
logger = logging.getLogger("pdf_processor")
|
| 147 |
+
logger.info(f"Processing PDF: {file_path}")
|
| 148 |
+
|
| 149 |
+
try:
|
| 150 |
+
# Upload the PDF file
|
| 151 |
+
logger.info("Uploading PDF file to Mistral API")
|
| 152 |
+
uploaded_file = self.client.files.upload(
|
| 153 |
+
file={
|
| 154 |
+
"file_name": file_path.stem,
|
| 155 |
+
"content": file_path.read_bytes(),
|
| 156 |
+
},
|
| 157 |
+
purpose="ocr",
|
| 158 |
+
)
|
| 159 |
+
|
| 160 |
+
# Get a signed URL for the uploaded file
|
| 161 |
+
signed_url = self.client.files.get_signed_url(file_id=uploaded_file.id, expiry=1)
|
| 162 |
+
|
| 163 |
+
# Process the PDF with OCR
|
| 164 |
+
logger.info(f"Processing PDF with OCR using {OCR_MODEL}")
|
| 165 |
+
pdf_response = self.client.ocr.process(
|
| 166 |
+
document=DocumentURLChunk(document_url=signed_url.url),
|
| 167 |
+
model=OCR_MODEL,
|
| 168 |
+
include_image_base64=True
|
| 169 |
+
)
|
| 170 |
+
|
| 171 |
+
# Limit pages if requested
|
| 172 |
+
pages_to_process = pdf_response.pages
|
| 173 |
+
total_pages = len(pdf_response.pages)
|
| 174 |
+
limited_pages = False
|
| 175 |
+
|
| 176 |
+
logger.info(f"PDF has {total_pages} total pages")
|
| 177 |
+
|
| 178 |
+
# Handle custom page selection if provided
|
| 179 |
+
if custom_pages:
|
| 180 |
+
# Convert to 0-based indexing and filter valid page numbers
|
| 181 |
+
valid_indices = [i-1 for i in custom_pages if 0 < i <= total_pages]
|
| 182 |
+
if valid_indices:
|
| 183 |
+
pages_to_process = [pdf_response.pages[i] for i in valid_indices]
|
| 184 |
+
limited_pages = True
|
| 185 |
+
logger.info(f"Processing {len(valid_indices)} custom-selected pages")
|
| 186 |
+
# Otherwise handle max_pages limit
|
| 187 |
+
elif max_pages and total_pages > max_pages:
|
| 188 |
+
pages_to_process = pages_to_process[:max_pages]
|
| 189 |
+
limited_pages = True
|
| 190 |
+
logger.info(f"Processing only first {max_pages} pages out of {total_pages} total pages")
|
| 191 |
+
|
| 192 |
+
# Calculate average confidence score based on OCR response if available
|
| 193 |
+
confidence_score = 0.0
|
| 194 |
+
try:
|
| 195 |
+
# Some OCR APIs provide confidence scores
|
| 196 |
+
confidence_values = []
|
| 197 |
+
for page in pages_to_process:
|
| 198 |
+
if hasattr(page, 'confidence'):
|
| 199 |
+
confidence_values.append(page.confidence)
|
| 200 |
+
|
| 201 |
+
if confidence_values:
|
| 202 |
+
confidence_score = sum(confidence_values) / len(confidence_values)
|
| 203 |
+
else:
|
| 204 |
+
confidence_score = 0.85 # Default if no confidence scores available
|
| 205 |
+
except:
|
| 206 |
+
confidence_score = 0.85 # Default fallback
|
| 207 |
+
|
| 208 |
+
# Combine pages' markdown into a single string
|
| 209 |
+
all_markdown = "\n\n".join([page.markdown for page in pages_to_process])
|
| 210 |
+
|
| 211 |
+
# Extract structured data using the appropriate model
|
| 212 |
+
if use_vision:
|
| 213 |
+
# Get base64 of first page for vision model
|
| 214 |
+
first_page_image = None
|
| 215 |
+
if pages_to_process and pages_to_process[0].images:
|
| 216 |
+
first_page_image = pages_to_process[0].images[0].image_base64
|
| 217 |
+
|
| 218 |
+
if first_page_image:
|
| 219 |
+
# Use vision model
|
| 220 |
+
logger.info(f"Using vision model: {VISION_MODEL}")
|
| 221 |
+
result = self._extract_structured_data_with_vision(first_page_image, all_markdown, file_path.name)
|
| 222 |
+
else:
|
| 223 |
+
# Fall back to text-only model if no image available
|
| 224 |
+
logger.info(f"No images in PDF, falling back to text model: {TEXT_MODEL}")
|
| 225 |
+
result = self._extract_structured_data_text_only(all_markdown, file_path.name)
|
| 226 |
+
else:
|
| 227 |
+
# Use text-only model
|
| 228 |
+
logger.info(f"Using text-only model: {TEXT_MODEL}")
|
| 229 |
+
result = self._extract_structured_data_text_only(all_markdown, file_path.name)
|
| 230 |
+
|
| 231 |
+
# Add page limit info to result if needed
|
| 232 |
+
if limited_pages:
|
| 233 |
+
result['limited_pages'] = {
|
| 234 |
+
'processed': len(pages_to_process),
|
| 235 |
+
'total': total_pages
|
| 236 |
+
}
|
| 237 |
+
|
| 238 |
+
# Add confidence score
|
| 239 |
+
result['confidence_score'] = confidence_score
|
| 240 |
+
|
| 241 |
+
# Store the raw OCR response for image rendering
|
| 242 |
+
result['raw_response'] = pdf_response
|
| 243 |
+
|
| 244 |
+
logger.info(f"PDF processing completed successfully")
|
| 245 |
+
return result
|
| 246 |
+
|
| 247 |
+
except Exception as e:
|
| 248 |
+
logger.error(f"Error processing PDF: {str(e)}")
|
| 249 |
+
# Return basic result on error
|
| 250 |
+
return {
|
| 251 |
+
"file_name": file_path.name,
|
| 252 |
+
"topics": ["Document"],
|
| 253 |
+
"languages": ["English"],
|
| 254 |
+
"confidence_score": 0.0,
|
| 255 |
+
"error": str(e),
|
| 256 |
+
"ocr_contents": {
|
| 257 |
+
"error": f"Failed to process PDF: {str(e)}",
|
| 258 |
+
"partial_text": "Document could not be fully processed."
|
| 259 |
+
}
|
| 260 |
+
}
|
| 261 |
+
|
| 262 |
+
def _process_image(self, file_path, use_vision=True):
|
| 263 |
+
"""Process an image file with OCR"""
|
| 264 |
+
logger = logging.getLogger("image_processor")
|
| 265 |
+
logger.info(f"Processing image: {file_path}")
|
| 266 |
+
|
| 267 |
+
try:
|
| 268 |
+
# Read and encode the image file
|
| 269 |
+
logger.info("Encoding image for API")
|
| 270 |
+
encoded_image = base64.b64encode(file_path.read_bytes()).decode()
|
| 271 |
+
base64_data_url = f"data:image/jpeg;base64,{encoded_image}"
|
| 272 |
+
|
| 273 |
+
# Process the image with OCR
|
| 274 |
+
logger.info(f"Processing image with OCR using {OCR_MODEL}")
|
| 275 |
+
image_response = self.client.ocr.process(
|
| 276 |
+
document=ImageURLChunk(image_url=base64_data_url),
|
| 277 |
+
model=OCR_MODEL,
|
| 278 |
+
include_image_base64=True
|
| 279 |
+
)
|
| 280 |
+
|
| 281 |
+
# Get the OCR markdown from the first page
|
| 282 |
+
image_ocr_markdown = image_response.pages[0].markdown if image_response.pages else ""
|
| 283 |
+
|
| 284 |
+
# Calculate confidence score if available
|
| 285 |
+
confidence_score = 0.85 # Default value
|
| 286 |
+
try:
|
| 287 |
+
if hasattr(image_response.pages[0], 'confidence'):
|
| 288 |
+
confidence_score = image_response.pages[0].confidence
|
| 289 |
+
except:
|
| 290 |
+
pass
|
| 291 |
+
|
| 292 |
+
# Extract structured data using the appropriate model
|
| 293 |
+
if use_vision:
|
| 294 |
+
logger.info(f"Using vision model: {VISION_MODEL}")
|
| 295 |
+
result = self._extract_structured_data_with_vision(base64_data_url, image_ocr_markdown, file_path.name)
|
| 296 |
+
else:
|
| 297 |
+
logger.info(f"Using text-only model: {TEXT_MODEL}")
|
| 298 |
+
result = self._extract_structured_data_text_only(image_ocr_markdown, file_path.name)
|
| 299 |
+
|
| 300 |
+
# Add confidence score
|
| 301 |
+
result['confidence_score'] = confidence_score
|
| 302 |
+
|
| 303 |
+
# Store the raw OCR response for image rendering
|
| 304 |
+
result['raw_response'] = image_response
|
| 305 |
+
|
| 306 |
+
logger.info("Image processing completed successfully")
|
| 307 |
+
return result
|
| 308 |
+
|
| 309 |
+
except Exception as e:
|
| 310 |
+
logger.error(f"Error processing image: {str(e)}")
|
| 311 |
+
# Return basic result on error
|
| 312 |
+
return {
|
| 313 |
+
"file_name": file_path.name,
|
| 314 |
+
"topics": ["Document"],
|
| 315 |
+
"languages": ["English"],
|
| 316 |
+
"confidence_score": 0.0,
|
| 317 |
+
"error": str(e),
|
| 318 |
+
"ocr_contents": {
|
| 319 |
+
"error": f"Failed to process image: {str(e)}",
|
| 320 |
+
"partial_text": "Image could not be processed."
|
| 321 |
+
}
|
| 322 |
+
}
|
| 323 |
+
|
| 324 |
+
def _extract_structured_data_with_vision(self, image_base64, ocr_markdown, filename):
|
| 325 |
+
"""Extract structured data using vision model"""
|
| 326 |
+
try:
|
| 327 |
+
# Parse with vision model with a timeout
|
| 328 |
+
chat_response = self.client.chat.parse(
|
| 329 |
+
model=VISION_MODEL,
|
| 330 |
+
messages=[
|
| 331 |
+
{
|
| 332 |
+
"role": "user",
|
| 333 |
+
"content": [
|
| 334 |
+
ImageURLChunk(image_url=image_base64),
|
| 335 |
+
TextChunk(text=(
|
| 336 |
+
f"This is a historical document's OCR in markdown:\n"
|
| 337 |
+
f"<BEGIN_IMAGE_OCR>\n{ocr_markdown}\n<END_IMAGE_OCR>.\n"
|
| 338 |
+
f"Convert this into a structured JSON response with the OCR contents in a sensible dictionary. "
|
| 339 |
+
f"Extract topics, languages, and organize the content logically."
|
| 340 |
+
))
|
| 341 |
+
],
|
| 342 |
+
},
|
| 343 |
+
],
|
| 344 |
+
response_format=StructuredOCRModel,
|
| 345 |
+
temperature=0
|
| 346 |
+
)
|
| 347 |
+
|
| 348 |
+
# Convert the response to a dictionary
|
| 349 |
+
result = json.loads(chat_response.choices[0].message.parsed.json())
|
| 350 |
+
|
| 351 |
+
# Ensure languages is a list of strings, not Language enum objects
|
| 352 |
+
if 'languages' in result:
|
| 353 |
+
result['languages'] = [str(lang) for lang in result.get('languages', [])]
|
| 354 |
+
|
| 355 |
+
except Exception as e:
|
| 356 |
+
# Fall back to text-only model if vision model fails
|
| 357 |
+
print(f"Vision model failed: {str(e)}. Falling back to text-only model.")
|
| 358 |
+
result = self._extract_structured_data_text_only(ocr_markdown, filename)
|
| 359 |
+
|
| 360 |
+
return result
|
| 361 |
+
|
| 362 |
+
def _extract_structured_data_text_only(self, ocr_markdown, filename):
|
| 363 |
+
"""Extract structured data using text-only model"""
|
| 364 |
+
try:
|
| 365 |
+
# Parse with text-only model with a timeout
|
| 366 |
+
chat_response = self.client.chat.parse(
|
| 367 |
+
model=TEXT_MODEL,
|
| 368 |
+
messages=[
|
| 369 |
+
{
|
| 370 |
+
"role": "user",
|
| 371 |
+
"content": f"This is a historical document's OCR in markdown:\n"
|
| 372 |
+
f"<BEGIN_IMAGE_OCR>\n{ocr_markdown}\n<END_IMAGE_OCR>.\n"
|
| 373 |
+
f"Convert this into a structured JSON response with the OCR contents. "
|
| 374 |
+
f"Extract topics, languages, and organize the content logically."
|
| 375 |
+
},
|
| 376 |
+
],
|
| 377 |
+
response_format=StructuredOCRModel,
|
| 378 |
+
temperature=0
|
| 379 |
+
)
|
| 380 |
+
|
| 381 |
+
# Convert the response to a dictionary
|
| 382 |
+
result = json.loads(chat_response.choices[0].message.parsed.json())
|
| 383 |
+
|
| 384 |
+
# Ensure languages is a list of strings, not Language enum objects
|
| 385 |
+
if 'languages' in result:
|
| 386 |
+
result['languages'] = [str(lang) for lang in result.get('languages', [])]
|
| 387 |
+
|
| 388 |
+
except Exception as e:
|
| 389 |
+
# Create a basic result if parsing fails
|
| 390 |
+
print(f"Text model failed: {str(e)}. Creating basic result.")
|
| 391 |
+
result = {
|
| 392 |
+
"file_name": filename,
|
| 393 |
+
"topics": ["Document"],
|
| 394 |
+
"languages": ["English"],
|
| 395 |
+
"ocr_contents": {
|
| 396 |
+
"raw_text": ocr_markdown
|
| 397 |
+
}
|
| 398 |
+
}
|
| 399 |
+
|
| 400 |
+
return result
|
| 401 |
+
|
| 402 |
+
# For testing directly
|
| 403 |
+
if __name__ == "__main__":
|
| 404 |
+
import sys
|
| 405 |
+
|
| 406 |
+
if len(sys.argv) < 2:
|
| 407 |
+
print("Usage: python structured_ocr.py <file_path>")
|
| 408 |
+
sys.exit(1)
|
| 409 |
+
|
| 410 |
+
file_path = sys.argv[1]
|
| 411 |
+
processor = StructuredOCR()
|
| 412 |
+
result = processor.process_file(file_path)
|
| 413 |
+
|
| 414 |
+
print(json.dumps(result, indent=2))
|
config.py
CHANGED
|
@@ -4,64 +4,14 @@ Configuration file for Mistral OCR processing.
|
|
| 4 |
Contains API key and other settings.
|
| 5 |
"""
|
| 6 |
import os
|
| 7 |
-
import logging
|
| 8 |
-
from dotenv import load_dotenv
|
| 9 |
|
| 10 |
-
#
|
| 11 |
-
|
|
|
|
|
|
|
|
|
|
| 12 |
|
| 13 |
-
#
|
| 14 |
-
|
| 15 |
-
|
| 16 |
-
|
| 17 |
-
# Priority order:
|
| 18 |
-
# 1. HF_API_KEY environment variable (Hugging Face standard)
|
| 19 |
-
# 2. HUGGING_FACE_API_KEY environment variable (alternative name)
|
| 20 |
-
# 3. HF_MISTRAL_API_KEY environment variable (for Hugging Face deployment)
|
| 21 |
-
# 4. MISTRAL_API_KEY environment variable (fallback)
|
| 22 |
-
# 5. Empty string (will show warning in app)
|
| 23 |
-
|
| 24 |
-
MISTRAL_API_KEY = os.environ.get("HF_API_KEY",
|
| 25 |
-
os.environ.get("HUGGING_FACE_API_KEY",
|
| 26 |
-
os.environ.get("HF_MISTRAL_API_KEY",
|
| 27 |
-
os.environ.get("MISTRAL_API_KEY", "")))).strip()
|
| 28 |
-
|
| 29 |
-
if not MISTRAL_API_KEY:
|
| 30 |
-
logger.warning("No Mistral API key found in environment variables. API functionality will be limited.")
|
| 31 |
-
|
| 32 |
-
# Check if we're in test mode (allows operation without valid API key)
|
| 33 |
-
# Set to False to use actual API calls with Mistral API
|
| 34 |
-
TEST_MODE = False
|
| 35 |
-
|
| 36 |
-
# Model settings with fallbacks
|
| 37 |
-
OCR_MODEL = os.environ.get("MISTRAL_OCR_MODEL", "mistral-ocr-latest")
|
| 38 |
-
TEXT_MODEL = os.environ.get("MISTRAL_TEXT_MODEL", "mistral-small-latest") # Updated from ministral-8b-latest
|
| 39 |
-
VISION_MODEL = os.environ.get("MISTRAL_VISION_MODEL", "mistral-small-latest") # faster model that supports vision
|
| 40 |
-
|
| 41 |
-
# Image preprocessing settings optimized for historical documents
|
| 42 |
-
# These can be customized from environment variables
|
| 43 |
-
IMAGE_PREPROCESSING = {
|
| 44 |
-
"enhance_contrast": float(os.environ.get("ENHANCE_CONTRAST", "3.5")), # Increased contrast for better text recognition
|
| 45 |
-
"sharpen": os.environ.get("SHARPEN", "True").lower() in ("true", "1", "yes"),
|
| 46 |
-
"denoise": os.environ.get("DENOISE", "True").lower() in ("true", "1", "yes"),
|
| 47 |
-
"max_size_mb": float(os.environ.get("MAX_IMAGE_SIZE_MB", "200.0")), # Increased size limit for better quality
|
| 48 |
-
"target_dpi": int(os.environ.get("TARGET_DPI", "300")), # Target DPI for scaling
|
| 49 |
-
"compression_quality": int(os.environ.get("COMPRESSION_QUALITY", "100")), # Higher quality for better OCR results
|
| 50 |
-
# # Enhanced settings for handwritten documents
|
| 51 |
-
"handwritten": {
|
| 52 |
-
"block_size": int(os.environ.get("HANDWRITTEN_BLOCK_SIZE", "21")), # Larger block size for adaptive thresholding
|
| 53 |
-
"constant": int(os.environ.get("HANDWRITTEN_CONSTANT", "5")), # Lower constant for adaptive thresholding
|
| 54 |
-
"use_dilation": os.environ.get("HANDWRITTEN_DILATION", "True").lower() in ("true", "1", "yes"), # Connect broken strokes
|
| 55 |
-
"dilation_iterations": int(os.environ.get("HANDWRITTEN_DILATION_ITERATIONS", "2")), # More iterations for better stroke connection
|
| 56 |
-
"dilation_kernel_size": int(os.environ.get("HANDWRITTEN_DILATION_KERNEL_SIZE", "3")) # Larger kernel for dilation
|
| 57 |
-
}
|
| 58 |
-
}
|
| 59 |
-
|
| 60 |
-
# OCR settings optimized for single-page performance
|
| 61 |
-
OCR_SETTINGS = {
|
| 62 |
-
"timeout_ms": int(os.environ.get("OCR_TIMEOUT_MS", "45000")), # Shorter timeout for single pages (45 seconds)
|
| 63 |
-
"max_retries": int(os.environ.get("OCR_MAX_RETRIES", "2")), # Fewer retries to avoid rate-limiting
|
| 64 |
-
"retry_delay": int(os.environ.get("OCR_RETRY_DELAY", "1")), # Shorter initial retry delay for faster execution
|
| 65 |
-
"include_image_base64": os.environ.get("INCLUDE_IMAGE_BASE64", "True").lower() in ("true", "1", "yes"),
|
| 66 |
-
"thread_count": int(os.environ.get("OCR_THREAD_COUNT", "2")) # Lower thread count to prevent API rate limiting
|
| 67 |
-
}
|
|
|
|
| 4 |
Contains API key and other settings.
|
| 5 |
"""
|
| 6 |
import os
|
|
|
|
|
|
|
| 7 |
|
| 8 |
+
# Your Mistral API key - get from Hugging Face secrets or environment variable
|
| 9 |
+
# The priority order is: HF_SPACES environment var > regular environment var > empty string
|
| 10 |
+
# Note: No default API key is provided for security reasons
|
| 11 |
+
MISTRAL_API_KEY = os.environ.get("HF_MISTRAL_API_KEY", # First check HF-specific env var
|
| 12 |
+
os.environ.get("MISTRAL_API_KEY", "")) # Then check regular env var
|
| 13 |
|
| 14 |
+
# Model settings
|
| 15 |
+
OCR_MODEL = "mistral-ocr-latest"
|
| 16 |
+
TEXT_MODEL = "ministral-8b-latest"
|
| 17 |
+
VISION_MODEL = "pixtral-12b-latest"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
constants.py
DELETED
|
@@ -1,193 +0,0 @@
|
|
| 1 |
-
"""
|
| 2 |
-
Constants for the Historical OCR application.
|
| 3 |
-
|
| 4 |
-
This module contains all the constants used throughout the application,
|
| 5 |
-
making it easier to maintain and update values in one place.
|
| 6 |
-
"""
|
| 7 |
-
|
| 8 |
-
# API limits
|
| 9 |
-
MAX_FILE_SIZE_MB = 200
|
| 10 |
-
MAX_PAGES = 20
|
| 11 |
-
|
| 12 |
-
# Caching
|
| 13 |
-
CACHE_TTL_SECONDS = 24 * 3600 # 24 hours
|
| 14 |
-
MAX_CACHE_ENTRIES = 20
|
| 15 |
-
|
| 16 |
-
# Image processing
|
| 17 |
-
MAX_IMAGE_DIMENSION = 2500
|
| 18 |
-
IMAGE_QUALITY = 100
|
| 19 |
-
|
| 20 |
-
# Document types
|
| 21 |
-
DOCUMENT_TYPES = [
|
| 22 |
-
"Auto-detect (standard processing)",
|
| 23 |
-
"Newspaper or Magazine",
|
| 24 |
-
"Letter or Correspondence",
|
| 25 |
-
"Book or Publication",
|
| 26 |
-
"Form or Legal Document",
|
| 27 |
-
"Recipe",
|
| 28 |
-
"Handwritten Document",
|
| 29 |
-
"Map or Illustration",
|
| 30 |
-
"Table or Spreadsheet",
|
| 31 |
-
"Other (specify in instructions)"
|
| 32 |
-
]
|
| 33 |
-
|
| 34 |
-
# Document layouts
|
| 35 |
-
DOCUMENT_LAYOUTS = [
|
| 36 |
-
"Standard layout",
|
| 37 |
-
"Multiple columns",
|
| 38 |
-
"Table/grid format",
|
| 39 |
-
"Mixed layout with images"
|
| 40 |
-
]
|
| 41 |
-
|
| 42 |
-
# Preprocessing document types
|
| 43 |
-
PREPROCESSING_DOC_TYPES = ["standard", "handwritten", "typed", "printed"]
|
| 44 |
-
|
| 45 |
-
# Rotation options
|
| 46 |
-
ROTATION_OPTIONS = [0, 90, 180, 270]
|
| 47 |
-
|
| 48 |
-
# PDF settings
|
| 49 |
-
DEFAULT_PDF_DPI = 100
|
| 50 |
-
MIN_PDF_DPI = 72
|
| 51 |
-
MAX_PDF_DPI = 300
|
| 52 |
-
DEFAULT_MAX_PAGES = 3
|
| 53 |
-
|
| 54 |
-
# Performance modes
|
| 55 |
-
PERFORMANCE_MODES = ["Quality", "Speed"]
|
| 56 |
-
|
| 57 |
-
# Custom prompt templates
|
| 58 |
-
CUSTOM_PROMPT_TEMPLATES = {
|
| 59 |
-
"Newspaper or Magazine": "This is a newspaper/magazine. Process columns from top to bottom, capture headlines, bylines, article text and captions.",
|
| 60 |
-
"Letter or Correspondence": "This is a letter/correspondence. Capture letterhead, date, greeting, body, closing and signature. Note any handwritten annotations.",
|
| 61 |
-
"Book or Publication": "This is a book/publication. Extract titles, headers, footnotes, page numbers and body text. Preserve paragraph structure and any special formatting.",
|
| 62 |
-
"Form or Legal Document": "This is a form/legal document. Extract all field labels and values, preserving the structure. Pay special attention to signature lines, dates, and any official markings.",
|
| 63 |
-
"Recipe": "This is a recipe. Extract title, ingredients list with measurements, and preparation instructions. Maintain the distinction between ingredients and preparation steps.",
|
| 64 |
-
"Handwritten Document": "This is a handwritten document. Carefully transcribe all handwritten text, preserving line breaks. Note any unclear sections or annotations.",
|
| 65 |
-
"Map or Illustration": "This is a map or illustration. Transcribe all labels, legends, captions, and annotations. Note any scale indicators or directional markings.",
|
| 66 |
-
"Table or Spreadsheet": "This is a table/spreadsheet. Preserve row and column structure, maintaining alignment of data. Extract headers and all cell values.",
|
| 67 |
-
"Other (specify in instructions)": "Please describe the document type and any special processing requirements here."
|
| 68 |
-
}
|
| 69 |
-
|
| 70 |
-
# Layout prompt additions
|
| 71 |
-
LAYOUT_PROMPT_ADDITIONS = {
|
| 72 |
-
"Multiple columns": "Document has multiple columns. Read each column from top to bottom, then move to the next column.",
|
| 73 |
-
"Table/grid format": "Document contains table data. Preserve row and column structure during extraction.",
|
| 74 |
-
"Mixed layout with images": "Document has mixed text layout with images. Extract text in proper reading order."
|
| 75 |
-
}
|
| 76 |
-
|
| 77 |
-
# Content themes for subject tag extraction
|
| 78 |
-
CONTENT_THEMES = {
|
| 79 |
-
# Historical Periods
|
| 80 |
-
"Prehistoric": ["paleolithic", "neolithic", "stone age", "bronze age", "iron age", "prehistoric", "ancient", "archaeology", "artifact", "primitive"],
|
| 81 |
-
"Ancient World": ["mesopotamia", "egypt", "greek", "roman", "persia", "babylonian", "assyrian", "pharaoh", "hieroglyphics", "cuneiform", "classical", "antiquity", "hellenistic", "republic", "empire"],
|
| 82 |
-
"Medieval": ["middle ages", "medieval", "feudal", "crusades", "byzantine", "carolingian", "holy roman empire", "dark ages", "castle", "knights", "chivalry", "monastery", "plague", "viking", "norse"],
|
| 83 |
-
"Renaissance": ["renaissance", "humanism", "reformation", "counter-reformation", "medici", "tudor", "elizabethan", "shakespeare", "machiavelli", "gutenberg", "printing press"],
|
| 84 |
-
"Early Modern": ["early modern", "enlightenment", "age of reason", "scientific revolution", "colonial", "colonization", "imperialism", "revolution", "baroque", "bourbon", "habsburg", "stuart"],
|
| 85 |
-
"18th Century": ["18th century", "1700s", "revolution", "american revolution", "french revolution", "enlightenment", "rococo", "neoclassical", "voltaire", "rousseau", "industrial"],
|
| 86 |
-
"19th Century": ["19th century", "1800s", "victorian", "romantic", "napoleonic", "civil war", "industrial revolution", "manifest destiny", "colonial", "imperialism", "belle epoque", "fin de siecle"],
|
| 87 |
-
"20th Century": ["20th century", "1900s", "world war", "great depression", "cold war", "interwar", "postwar", "modernism", "atomic", "post-colonial", "totalitarian", "fascism", "soviet", "civil rights"],
|
| 88 |
-
"Contemporary": ["contemporary", "modern", "postmodern", "digital age", "globalization", "information age", "post-industrial", "post-colonial", "post-soviet", "post-war", "21st century"],
|
| 89 |
-
|
| 90 |
-
# Geographic Contexts
|
| 91 |
-
"European History": ["europe", "western europe", "eastern europe", "central europe", "mediterranean", "nordic", "iberian", "british", "habsburg", "bourbon", "prussia", "holy roman empire"],
|
| 92 |
-
"Asian History": ["asia", "east asia", "south asia", "central asia", "southeast asia", "china", "japan", "india", "persia", "ottoman", "mongolian", "dynasty", "shogunate", "mughal", "silk road"],
|
| 93 |
-
"African History": ["africa", "north africa", "west africa", "east africa", "sub-saharan", "sahel", "swahili", "maghreb", "nubian", "ethiopian", "zulu", "colonial africa", "apartheid"],
|
| 94 |
-
"American History": ["america", "colonial america", "revolutionary", "antebellum", "civil war", "reconstruction", "frontier", "westward expansion", "manifest destiny", "native american", "indigenous"],
|
| 95 |
-
"Latin American": ["latin america", "mesoamerica", "caribbean", "aztec", "mayan", "inca", "colonial", "viceroyalty", "independence", "revolution", "hispanic", "creole", "mestizo", "indigenous"],
|
| 96 |
-
"Oceanic History": ["oceania", "pacific", "australian", "aboriginal", "indigenous", "polynesian", "melanesian", "micronesian", "maori", "maritime", "exploration", "settlement", "colonial"],
|
| 97 |
-
|
| 98 |
-
# Historical Methodologies & Approaches
|
| 99 |
-
"Archival Research": ["archive", "manuscript", "primary source", "provenance", "document", "preservation", "cataloging", "repository", "collection", "papers", "fonds", "records", "registry"],
|
| 100 |
-
"Oral History": ["oral history", "testimony", "interview", "narrative", "memory", "ethnography", "storytelling", "tradition", "folklore", "witness", "account", "recording", "indigenous knowledge"],
|
| 101 |
-
"Historical Archaeology": ["archaeology", "excavation", "artifact", "material culture", "stratigraphy", "conservation", "field work", "site", "ruins", "preservation", "heritage", "restoration"],
|
| 102 |
-
"Digital History": ["digital", "database", "digitization", "computational", "network analysis", "gis", "mapping", "visualization", "data mining", "text analysis", "digital humanities", "encoding"],
|
| 103 |
-
"Historiography": ["historiography", "revisionism", "interpretation", "narrative", "discourse", "bias", "perspective", "theory", "methodology", "framework", "historical thinking", "meta-history"],
|
| 104 |
-
|
| 105 |
-
# Historical Document Types
|
| 106 |
-
"Administrative Records": ["record", "registry", "account", "ledger", "census", "tax roll", "inventory", "charter", "deed", "grant", "patent", "minutes", "docket", "survey", "assessment", "register"],
|
| 107 |
-
"Diplomatic Documents": ["treaty", "agreement", "proclamation", "declaration", "diplomatic", "embassy", "consul", "dispatch", "communique", "protocol", "convention", "alliance", "international"],
|
| 108 |
-
"Personal Papers": ["diary", "journal", "memoir", "autobiography", "correspondence", "letter", "personal", "private", "papers", "notes", "scrapbook", "commonplace book", "sketchbook"],
|
| 109 |
-
"Media History": ["newspaper", "gazette", "periodical", "pamphlet", "broadside", "print culture", "press", "editorial", "journalism", "reporter", "editor", "circulation", "readership", "subscriber"],
|
| 110 |
-
"Visual Materials": ["photograph", "illustration", "print", "map", "atlas", "cartography", "engraving", "woodcut", "lithograph", "panorama", "portrait", "landscape", "sketch", "drawing", "plate"],
|
| 111 |
-
"Legal Documents": ["legal", "law", "statute", "code", "constitution", "legislation", "decree", "ordinance", "bylaw", "regulation", "case", "trial", "testimony", "deposition", "verdict", "judgment"],
|
| 112 |
-
|
| 113 |
-
# Historical Themes & Movements
|
| 114 |
-
"Economic History": ["economic", "commerce", "trade", "market", "merchant", "finance", "banking", "currency", "coin", "inflation", "recession", "depression", "exchange", "capital", "labor", "guild"],
|
| 115 |
-
"Social History": ["social", "society", "class", "status", "hierarchy", "everyday life", "community", "neighborhood", "urban", "rural", "poverty", "wealth", "leisure", "entertainment", "customs"],
|
| 116 |
-
"Political History": ["political", "politics", "government", "state", "monarchy", "republic", "democracy", "aristocracy", "parliament", "congress", "election", "regime", "policy", "reform", "revolution"],
|
| 117 |
-
"Intellectual History": ["intellectual", "idea", "philosophy", "theory", "concept", "movement", "thought", "discourse", "debate", "enlightenment", "rationalism", "empiricism", "ideology"],
|
| 118 |
-
"Cultural History": ["cultural", "culture", "custom", "tradition", "ritual", "ceremony", "festival", "celebration", "holiday", "folklore", "music", "art", "literature", "fashion", "consumption"],
|
| 119 |
-
"Religious History": ["religious", "religion", "church", "theology", "belief", "faith", "worship", "ritual", "sacred", "clergy", "monastery", "temple", "mosque", "synagogue", "pilgrimage", "sect"],
|
| 120 |
-
"Military History": ["military", "war", "conflict", "battle", "campaign", "siege", "army", "navy", "soldier", "officer", "regiment", "battalion", "artillery", "cavalry", "infantry", "strategy", "tactics"],
|
| 121 |
-
"Science History": ["scientific", "science", "experiment", "discovery", "theory", "hypothesis", "observation", "laboratory", "academy", "research", "natural philosophy", "medicine", "technology"],
|
| 122 |
-
"Environmental History": ["environmental", "ecology", "climate", "weather", "landscape", "agriculture", "farming", "forestry", "conservation", "pollution", "resource", "sustainability", "natural"],
|
| 123 |
-
|
| 124 |
-
# Specialized Historical Topics
|
| 125 |
-
"Migration History": ["migration", "immigration", "emigration", "diaspora", "exile", "refugee", "settlement", "colonization", "population movement", "forced migration", "displacement", "resettlement"],
|
| 126 |
-
"Maritime History": ["maritime", "naval", "shipping", "navigation", "sailor", "piracy", "privateering", "admiralty", "port", "harbor", "shipyard", "vessel", "sail", "trade route", "exploration"],
|
| 127 |
-
"Gender History": ["gender", "women", "feminist", "sexuality", "masculinity", "femininity", "patriarchy", "suffrage", "domestic", "family", "marriage", "emancipation", "rights", "equality"],
|
| 128 |
-
"Labor History": ["labor", "worker", "union", "strike", "apprentice", "guild", "factory", "workshop", "wage", "hours", "working conditions", "industrialization", "mechanization", "automation"],
|
| 129 |
-
"Urban History": ["urban", "city", "town", "metropolitan", "municipal", "civic", "suburb", "neighborhood", "planning", "infrastructure", "utilities", "housing", "development", "gentrification"],
|
| 130 |
-
"Rural History": ["rural", "countryside", "village", "agricultural", "farming", "peasant", "yeoman", "tenant", "sharecropper", "enclosure", "common land", "manor", "estate", "plantation"],
|
| 131 |
-
"Colonial History": ["colonial", "colony", "settlement", "frontier", "borderland", "territory", "dominion", "province", "governance", "administration", "native", "indigenous", "contact zone"],
|
| 132 |
-
"Indigenous History": ["indigenous", "native", "aboriginal", "first nations", "tribal", "reservation", "sovereignty", "land rights", "treaty rights", "cultural preservation", "oral tradition"],
|
| 133 |
-
|
| 134 |
-
# General Historical Terms
|
| 135 |
-
"Historical": ["history", "historical", "historiography", "heritage", "legacy", "tradition", "memory", "commemoration", "preservation", "conservation", "restoration", "interpretation", "significance"],
|
| 136 |
-
"Chronology": ["chronology", "timeline", "periodization", "era", "epoch", "age", "century", "decade", "millennium", "year", "date", "dating", "chronological", "contemporary", "synchronic", "diachronic"],
|
| 137 |
-
"Heritage": ["heritage", "preservation", "conservation", "landmark", "monument", "historic site", "museum", "archive", "collection", "artifact", "relic", "antiquity", "cultural heritage", "patrimony"]
|
| 138 |
-
}
|
| 139 |
-
|
| 140 |
-
# Period tags based on year ranges
|
| 141 |
-
# These ranges are used to assign historical period tags to documents based on their year.
|
| 142 |
-
PERIOD_TAGS = {
|
| 143 |
-
(0, 499): "Ancient Era (to 500 CE)",
|
| 144 |
-
(500, 999): "Early Medieval (500–1000)",
|
| 145 |
-
(1000, 1299): "High Medieval (1000–1300)",
|
| 146 |
-
(1300, 1499): "Late Medieval (1300–1500)",
|
| 147 |
-
(1500, 1599): "Renaissance (1500–1600)",
|
| 148 |
-
(1600, 1699): "Early Modern (1600–1700)",
|
| 149 |
-
(1700, 1775): "Enlightenment (1700–1775)",
|
| 150 |
-
(1776, 1799): "Age of Revolutions (1776–1800)",
|
| 151 |
-
(1800, 1849): "Early 19th Century (1800–1850)",
|
| 152 |
-
(1850, 1899): "Late 19th Century (1850–1900)",
|
| 153 |
-
(1900, 1918): "Early 20th Century & WWI (1900–1918)",
|
| 154 |
-
(1919, 1938): "Interwar Period (1919–1938)",
|
| 155 |
-
(1939, 1945): "World War II (1939–1945)",
|
| 156 |
-
(1946, 1968): "Postwar & Mid-20th Century (1946–1968)",
|
| 157 |
-
(1969, 1989): "Late 20th Century (1969–1989)",
|
| 158 |
-
(1990, 2000): "Turn of the 21st Century (1990–2000)",
|
| 159 |
-
(2001, 2099): "Contemporary (21st Century)"
|
| 160 |
-
}
|
| 161 |
-
|
| 162 |
-
# Default fallback tags for documents when no specific tags are detected.
|
| 163 |
-
DEFAULT_TAGS = [
|
| 164 |
-
"Document",
|
| 165 |
-
"Historical",
|
| 166 |
-
"Text",
|
| 167 |
-
"Primary Source",
|
| 168 |
-
"Archival Material",
|
| 169 |
-
"Record",
|
| 170 |
-
"Manuscript",
|
| 171 |
-
"Printed Material",
|
| 172 |
-
"Correspondence",
|
| 173 |
-
"Publication"
|
| 174 |
-
]
|
| 175 |
-
|
| 176 |
-
# Generic tags that can be used for broad categorization or as supplemental tags.
|
| 177 |
-
GENERIC_TAGS = [
|
| 178 |
-
"Archive",
|
| 179 |
-
"Content",
|
| 180 |
-
"Record",
|
| 181 |
-
"Source",
|
| 182 |
-
"Material",
|
| 183 |
-
"Page",
|
| 184 |
-
"Scan",
|
| 185 |
-
"Image",
|
| 186 |
-
"Transcription",
|
| 187 |
-
"Uncategorized",
|
| 188 |
-
"General",
|
| 189 |
-
"Miscellaneous"
|
| 190 |
-
]
|
| 191 |
-
|
| 192 |
-
# UI constants
|
| 193 |
-
PROGRESS_DELAY = 0.8 # Seconds to show completion message
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
error_handler.py
DELETED
|
@@ -1,65 +0,0 @@
|
|
| 1 |
-
import logging
|
| 2 |
-
import streamlit as st
|
| 3 |
-
import time
|
| 4 |
-
from constants import MAX_FILE_SIZE_MB
|
| 5 |
-
|
| 6 |
-
# Configure logging
|
| 7 |
-
logger = logging.getLogger("error_handler")
|
| 8 |
-
logger.setLevel(logging.INFO)
|
| 9 |
-
|
| 10 |
-
def handle_ocr_error(exception, progress_reporter=None):
|
| 11 |
-
"""
|
| 12 |
-
Handle OCR processing errors and provide user-friendly messages
|
| 13 |
-
|
| 14 |
-
Args:
|
| 15 |
-
exception: The exception that occurred
|
| 16 |
-
progress_reporter: ProgressReporter instance for UI updates
|
| 17 |
-
|
| 18 |
-
Returns:
|
| 19 |
-
str: User-friendly error message
|
| 20 |
-
"""
|
| 21 |
-
error_message = str(exception)
|
| 22 |
-
|
| 23 |
-
# Complete progress reporting if provided
|
| 24 |
-
if progress_reporter:
|
| 25 |
-
progress_reporter.complete(success=False)
|
| 26 |
-
|
| 27 |
-
# Check for specific error types and provide helpful user-facing messages
|
| 28 |
-
if "rate limit" in error_message.lower() or "429" in error_message or "requests rate limit exceeded" in error_message.lower():
|
| 29 |
-
friendly_message = "The AI service is currently experiencing high demand. Please try again in a few minutes."
|
| 30 |
-
logger.error(f"Rate limit error: {error_message}")
|
| 31 |
-
return friendly_message
|
| 32 |
-
elif "quota" in error_message.lower() or "credit" in error_message.lower() or "subscription" in error_message.lower():
|
| 33 |
-
friendly_message = "The API usage quota has been reached. Please check your API key and subscription limits."
|
| 34 |
-
logger.error(f"API quota error: {error_message}")
|
| 35 |
-
return friendly_message
|
| 36 |
-
elif "timeout" in error_message.lower() or "timed out" in error_message.lower():
|
| 37 |
-
friendly_message = "The request timed out. This may be due to a large document or high server load. Please try again or use a smaller document."
|
| 38 |
-
logger.error(f"Timeout error: {error_message}")
|
| 39 |
-
return friendly_message
|
| 40 |
-
elif "file size" in error_message.lower() or "too large" in error_message.lower():
|
| 41 |
-
friendly_message = f"The file is too large. Maximum file size is {MAX_FILE_SIZE_MB}MB."
|
| 42 |
-
logger.error(f"File size error: {error_message}")
|
| 43 |
-
return friendly_message
|
| 44 |
-
else:
|
| 45 |
-
# Generic error message for other errors
|
| 46 |
-
logger.error(f"OCR processing error: {error_message}", exc_info=True)
|
| 47 |
-
return f"An error occurred during processing: {error_message}"
|
| 48 |
-
|
| 49 |
-
def check_file_size(file_bytes):
|
| 50 |
-
"""
|
| 51 |
-
Check if file size is within limits
|
| 52 |
-
|
| 53 |
-
Args:
|
| 54 |
-
file_bytes: File content as bytes
|
| 55 |
-
|
| 56 |
-
Returns:
|
| 57 |
-
tuple: (is_valid, file_size_mb, error_message)
|
| 58 |
-
"""
|
| 59 |
-
file_size_mb = len(file_bytes) / (1024 * 1024)
|
| 60 |
-
|
| 61 |
-
if file_size_mb > MAX_FILE_SIZE_MB:
|
| 62 |
-
error_message = f"File size {file_size_mb:.2f} MB exceeds limit of {MAX_FILE_SIZE_MB} MB"
|
| 63 |
-
return False, file_size_mb, error_message
|
| 64 |
-
|
| 65 |
-
return True, file_size_mb, None
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
image_segmentation.py
DELETED
|
@@ -1,253 +0,0 @@
|
|
| 1 |
-
"""
|
| 2 |
-
Image segmentation utility for OCR preprocessing.
|
| 3 |
-
Separates text regions from image regions to improve OCR accuracy on mixed-content documents.
|
| 4 |
-
Uses content-aware adaptive segmentation for improved results across document types.
|
| 5 |
-
"""
|
| 6 |
-
|
| 7 |
-
import cv2
|
| 8 |
-
import numpy as np
|
| 9 |
-
from PIL import Image
|
| 10 |
-
import io
|
| 11 |
-
import base64
|
| 12 |
-
import logging
|
| 13 |
-
from pathlib import Path
|
| 14 |
-
from typing import Tuple, List, Dict, Union, Optional
|
| 15 |
-
|
| 16 |
-
# Configure logging
|
| 17 |
-
logging.basicConfig(level=logging.INFO,
|
| 18 |
-
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
|
| 19 |
-
logger = logging.getLogger(__name__)
|
| 20 |
-
|
| 21 |
-
def segment_image_for_ocr(image_path: Union[str, Path], vision_enabled: bool = True, preserve_content: bool = True) -> Dict[str, Union[Image.Image, str]]:
|
| 22 |
-
"""
|
| 23 |
-
Prepare image for OCR processing using content-aware segmentation.
|
| 24 |
-
Uses adaptive region detection based on text density analysis.
|
| 25 |
-
|
| 26 |
-
Args:
|
| 27 |
-
image_path: Path to the image file
|
| 28 |
-
vision_enabled: Whether the vision model is enabled
|
| 29 |
-
preserve_content: Whether to preserve original content without enhancement
|
| 30 |
-
|
| 31 |
-
Returns:
|
| 32 |
-
Dict containing segmentation results
|
| 33 |
-
"""
|
| 34 |
-
# Convert to Path object if string
|
| 35 |
-
image_file = Path(image_path) if isinstance(image_path, str) else image_path
|
| 36 |
-
|
| 37 |
-
# Log start of processing
|
| 38 |
-
logger.info(f"Preparing image for Mistral OCR: {image_file.name}")
|
| 39 |
-
|
| 40 |
-
try:
|
| 41 |
-
# Open original image with PIL
|
| 42 |
-
with Image.open(image_file) as pil_img:
|
| 43 |
-
# Check for low entropy images when vision is disabled
|
| 44 |
-
if not vision_enabled:
|
| 45 |
-
from utils.image_utils import calculate_image_entropy
|
| 46 |
-
ent = calculate_image_entropy(pil_img)
|
| 47 |
-
if ent < 3.5: # Likely line-art or blank page
|
| 48 |
-
logger.info(f"Low entropy image detected ({ent:.2f}), classifying as illustration")
|
| 49 |
-
return {
|
| 50 |
-
'text_regions': None,
|
| 51 |
-
'image_regions': pil_img,
|
| 52 |
-
'text_mask_base64': None,
|
| 53 |
-
'combined_result': None,
|
| 54 |
-
'text_regions_coordinates': []
|
| 55 |
-
}
|
| 56 |
-
|
| 57 |
-
# Convert to RGB if needed
|
| 58 |
-
if pil_img.mode != 'RGB':
|
| 59 |
-
pil_img = pil_img.convert('RGB')
|
| 60 |
-
|
| 61 |
-
# Get image dimensions
|
| 62 |
-
img_np = np.array(pil_img)
|
| 63 |
-
img_width, img_height = pil_img.size
|
| 64 |
-
|
| 65 |
-
# Analyze text density to determine if advanced segmentation is needed
|
| 66 |
-
# This replaces document-specific logic with content-aware analysis
|
| 67 |
-
from utils.image_utils import estimate_text_density
|
| 68 |
-
text_density = estimate_text_density(img_np)
|
| 69 |
-
|
| 70 |
-
# Use adaptive approach for documents with unusual text distribution
|
| 71 |
-
if text_density['pattern'] == 'varied' or text_density['uppercase_sections'] > 0:
|
| 72 |
-
logger.info(f"Using adaptive segmentation for document with varied text density pattern={text_density['pattern']}, uppercase_sections={text_density['uppercase_sections']}")
|
| 73 |
-
|
| 74 |
-
# Detect content regions based on text density
|
| 75 |
-
from utils.text_utils import detect_content_regions
|
| 76 |
-
regions = detect_content_regions(img_np)
|
| 77 |
-
|
| 78 |
-
# Create visualization with green borders around the text regions
|
| 79 |
-
vis_img = img_np.copy()
|
| 80 |
-
|
| 81 |
-
# Draw regions on visualization
|
| 82 |
-
for x, y, w, h in regions:
|
| 83 |
-
cv2.rectangle(vis_img, (x, y), (x+w, y+h), (0, 255, 0), 3)
|
| 84 |
-
|
| 85 |
-
# Add text to indicate we're using adaptive processing
|
| 86 |
-
font = cv2.FONT_HERSHEY_SIMPLEX
|
| 87 |
-
cv2.putText(vis_img, "Adaptive region processing", (30, 60), font, 1, (0, 255, 0), 2)
|
| 88 |
-
|
| 89 |
-
# Create visualization images
|
| 90 |
-
text_regions_vis = Image.fromarray(vis_img)
|
| 91 |
-
image_regions_vis = text_regions_vis.copy()
|
| 92 |
-
|
| 93 |
-
# Create a mask highlighting the text regions
|
| 94 |
-
text_mask = np.zeros((img_height, img_width), dtype=np.uint8)
|
| 95 |
-
for x, y, w, h in regions:
|
| 96 |
-
text_mask[y:y+h, x:x+w] = 255
|
| 97 |
-
|
| 98 |
-
_, buffer = cv2.imencode('.png', text_mask)
|
| 99 |
-
text_mask_base64 = base64.b64encode(buffer).decode('utf-8')
|
| 100 |
-
|
| 101 |
-
# Extract region images
|
| 102 |
-
region_images = []
|
| 103 |
-
for i, (x, y, w, h) in enumerate(regions):
|
| 104 |
-
region = img_np[y:y+h, x:x+w].copy()
|
| 105 |
-
region_pil = Image.fromarray(region)
|
| 106 |
-
|
| 107 |
-
region_info = {
|
| 108 |
-
'image': region,
|
| 109 |
-
'pil_image': region_pil,
|
| 110 |
-
'coordinates': (x, y, w, h),
|
| 111 |
-
'padded_coordinates': (x, y, w, h),
|
| 112 |
-
'order': i
|
| 113 |
-
}
|
| 114 |
-
region_images.append(region_info)
|
| 115 |
-
|
| 116 |
-
# Return the adaptive segmentation results
|
| 117 |
-
return {
|
| 118 |
-
'text_regions': text_regions_vis,
|
| 119 |
-
'image_regions': image_regions_vis,
|
| 120 |
-
'text_mask_base64': f"data:image/png;base64,{text_mask_base64}",
|
| 121 |
-
'combined_result': pil_img,
|
| 122 |
-
'text_regions_coordinates': regions,
|
| 123 |
-
'region_images': region_images,
|
| 124 |
-
'segmentation_type': 'adaptive'
|
| 125 |
-
}
|
| 126 |
-
else:
|
| 127 |
-
# SIMPLIFIED APPROACH for most documents
|
| 128 |
-
# Let Mistral OCR handle the entire document understanding process
|
| 129 |
-
logger.info(f"Using standard approach for document with uniform text density")
|
| 130 |
-
|
| 131 |
-
# For visualization, mark the entire image as a text region
|
| 132 |
-
full_image_region = [(0, 0, img_width, img_height)]
|
| 133 |
-
|
| 134 |
-
# Create visualization with a simple border
|
| 135 |
-
vis_img = img_np.copy()
|
| 136 |
-
cv2.rectangle(vis_img, (5, 5), (img_width-5, img_height-5), (0, 255, 0), 5)
|
| 137 |
-
|
| 138 |
-
# Add text to indicate this is using Mistral's native processing
|
| 139 |
-
font = cv2.FONT_HERSHEY_SIMPLEX
|
| 140 |
-
cv2.putText(vis_img, "Processed by Mistral OCR", (30, 60), font, 1, (0, 255, 0), 2)
|
| 141 |
-
|
| 142 |
-
# Create visualizations and masks
|
| 143 |
-
text_regions_vis = Image.fromarray(vis_img)
|
| 144 |
-
image_regions_vis = text_regions_vis.copy()
|
| 145 |
-
|
| 146 |
-
# Create a mask of the entire image (just for visualization)
|
| 147 |
-
text_mask = np.ones((img_height, img_width), dtype=np.uint8) * 255
|
| 148 |
-
_, buffer = cv2.imencode('.png', text_mask)
|
| 149 |
-
text_mask_base64 = base64.b64encode(buffer).decode('utf-8')
|
| 150 |
-
|
| 151 |
-
# Return the original image as the combined result
|
| 152 |
-
return {
|
| 153 |
-
'text_regions': text_regions_vis,
|
| 154 |
-
'image_regions': image_regions_vis,
|
| 155 |
-
'text_mask_base64': f"data:image/png;base64,{text_mask_base64}",
|
| 156 |
-
'combined_result': pil_img,
|
| 157 |
-
'text_regions_coordinates': full_image_region,
|
| 158 |
-
'region_images': [{
|
| 159 |
-
'image': img_np,
|
| 160 |
-
'pil_image': pil_img,
|
| 161 |
-
'coordinates': (0, 0, img_width, img_height),
|
| 162 |
-
'padded_coordinates': (0, 0, img_width, img_height),
|
| 163 |
-
'order': 0
|
| 164 |
-
}],
|
| 165 |
-
'segmentation_type': 'simplified'
|
| 166 |
-
}
|
| 167 |
-
|
| 168 |
-
except Exception as e:
|
| 169 |
-
logger.error(f"Error segmenting image {image_file.name}: {str(e)}")
|
| 170 |
-
# Return None values if processing fails
|
| 171 |
-
return {
|
| 172 |
-
'text_regions': None,
|
| 173 |
-
'image_regions': None,
|
| 174 |
-
'text_mask_base64': None,
|
| 175 |
-
'combined_result': None,
|
| 176 |
-
'text_regions_coordinates': []
|
| 177 |
-
}
|
| 178 |
-
|
| 179 |
-
def process_segmented_image(image_path: Union[str, Path], output_dir: Optional[Path] = None, preserve_content: bool = True) -> Dict:
|
| 180 |
-
"""
|
| 181 |
-
Process an image using segmentation for improved OCR, saving visualization outputs.
|
| 182 |
-
|
| 183 |
-
Args:
|
| 184 |
-
image_path: Path to the image file
|
| 185 |
-
output_dir: Optional directory to save visualization outputs
|
| 186 |
-
|
| 187 |
-
Returns:
|
| 188 |
-
Dictionary with processing results and paths to output files
|
| 189 |
-
"""
|
| 190 |
-
# Convert to Path object if string
|
| 191 |
-
image_file = Path(image_path) if isinstance(image_path, str) else image_path
|
| 192 |
-
|
| 193 |
-
# Create output directory if not provided
|
| 194 |
-
if output_dir is None:
|
| 195 |
-
output_dir = Path("output") / "segmentation"
|
| 196 |
-
output_dir.mkdir(parents=True, exist_ok=True)
|
| 197 |
-
|
| 198 |
-
# Process the image with segmentation
|
| 199 |
-
segmentation_results = segment_image_for_ocr(image_file)
|
| 200 |
-
|
| 201 |
-
# Prepare results dictionary
|
| 202 |
-
results = {
|
| 203 |
-
'original_image': str(image_file),
|
| 204 |
-
'output_files': {}
|
| 205 |
-
}
|
| 206 |
-
|
| 207 |
-
# Save visualization outputs if segmentation was successful
|
| 208 |
-
if segmentation_results['text_regions'] is not None:
|
| 209 |
-
# Save text regions visualization
|
| 210 |
-
text_regions_path = output_dir / f"{image_file.stem}_text_regions.jpg"
|
| 211 |
-
segmentation_results['text_regions'].save(text_regions_path)
|
| 212 |
-
results['output_files']['text_regions'] = str(text_regions_path)
|
| 213 |
-
|
| 214 |
-
# Save image regions visualization
|
| 215 |
-
image_regions_path = output_dir / f"{image_file.stem}_image_regions.jpg"
|
| 216 |
-
segmentation_results['image_regions'].save(image_regions_path)
|
| 217 |
-
results['output_files']['image_regions'] = str(image_regions_path)
|
| 218 |
-
|
| 219 |
-
# Save combined result
|
| 220 |
-
combined_path = output_dir / f"{image_file.stem}_combined.jpg"
|
| 221 |
-
segmentation_results['combined_result'].save(combined_path)
|
| 222 |
-
results['output_files']['combined_result'] = str(combined_path)
|
| 223 |
-
|
| 224 |
-
# Save text mask visualization
|
| 225 |
-
text_mask_path = output_dir / f"{image_file.stem}_text_mask.png"
|
| 226 |
-
# Save text mask from base64
|
| 227 |
-
if segmentation_results['text_mask_base64']:
|
| 228 |
-
base64_data = segmentation_results['text_mask_base64'].split(',')[1]
|
| 229 |
-
with open(text_mask_path, 'wb') as f:
|
| 230 |
-
f.write(base64.b64decode(base64_data))
|
| 231 |
-
results['output_files']['text_mask'] = str(text_mask_path)
|
| 232 |
-
|
| 233 |
-
# Add detected text regions count
|
| 234 |
-
results['text_regions_count'] = len(segmentation_results['text_regions_coordinates'])
|
| 235 |
-
results['text_regions_coordinates'] = segmentation_results['text_regions_coordinates']
|
| 236 |
-
|
| 237 |
-
return results
|
| 238 |
-
|
| 239 |
-
if __name__ == "__main__":
|
| 240 |
-
# Simple test - process a sample image if run directly
|
| 241 |
-
import sys
|
| 242 |
-
|
| 243 |
-
if len(sys.argv) > 1:
|
| 244 |
-
image_path = sys.argv[1]
|
| 245 |
-
else:
|
| 246 |
-
image_path = "input/handwritten-journal.jpg" # Example image path"
|
| 247 |
-
|
| 248 |
-
logger.info(f"Testing image segmentation on {image_path}")
|
| 249 |
-
results = process_segmented_image(image_path)
|
| 250 |
-
|
| 251 |
-
# Print results summary
|
| 252 |
-
logger.info(f"Segmentation complete. Found {results.get('text_regions_count', 0)} text regions.")
|
| 253 |
-
logger.info(f"Output files saved to: {[path for path in results.get('output_files', {}).values()]}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
input/The Magician, or Bottle Cungerer.jpeg
ADDED
|
Git LFS Details
|
input/baldwin-letter-1.jpg
ADDED
|
Git LFS Details
|
input/baldwin-letter-2.jpg
ADDED
|
Git LFS Details
|
input/flier.png
ADDED
|
input/harpers.pdf
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:3c9030714b07bb5f7c9adf8b175975baa9b4f40402da62d69cad9b0d4ba61b94
|
| 3 |
-
size 14931299
|
|
|
|
|
|
|
|
|
|
|
|
input/letter-1.jpg
ADDED
|
Git LFS Details
|
input/letter-2.jpg
ADDED
|
Git LFS Details
|
input/letter-3.jpg
ADDED
|
Git LFS Details
|
input/magician-satire.jpg
ADDED
|
Git LFS Details
|
input/menu.pdf
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:42d96008f374f5be8046b569c868e33f4e5a0e5e166c245d324b44140c7e6c2e
|
| 3 |
+
size 2554815
|
input/milgram-flier.png
CHANGED
|
Git LFS Details
|
|
input/okeefe-menu.pdf
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:42d96008f374f5be8046b569c868e33f4e5a0e5e166c245d324b44140c7e6c2e
|
| 3 |
+
size 2554815
|
input/okeefe-recipe.jpg
ADDED
|
input/recipe.jpg
CHANGED
|
Git LFS Details
|
|