File size: 26,478 Bytes
85bdb4e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
import html
import io
import json
import os
import sys
import tempfile
import time
from pathlib import Path

import cv2
import numpy as np
import streamlit as st
from pdf2image import convert_from_bytes
from PIL import Image, ImageEnhance, ImageFilter

# Import the StructuredOCR class and config from the local files
from structured_ocr import StructuredOCR
from config import MISTRAL_API_KEY

# Set page configuration: browser-tab title and icon, wide layout, and a
# sidebar that starts expanded.
# NOTE(review): Streamlit has historically required set_page_config to be the
# first Streamlit command executed in the script — keep it above other st.* calls.
st.set_page_config(
    page_title="Historical OCR",
    page_icon="🚀",
    layout="wide",
    initial_sidebar_state="expanded"
)

# Enable caching for expensive operations
@st.cache_data(ttl=3600, show_spinner=False)
def convert_pdf_to_images(pdf_bytes, dpi=150):
    """Render a PDF, supplied as raw bytes, into a list of page images.

    Results are memoised by Streamlit for one hour (``ttl=3600``).

    Args:
        pdf_bytes: Raw bytes of the PDF document.
        dpi: Rendering resolution forwarded to ``convert_from_bytes``.

    Returns:
        Whatever ``convert_from_bytes`` returns on success; an empty list if
        the conversion raised (the error is reported with ``st.error``).
    """
    try:
        pages = convert_from_bytes(pdf_bytes, dpi=dpi)
    except Exception as exc:
        # Best-effort: report the problem in the UI and hand back "no pages".
        st.error(f"Error converting PDF: {str(exc)}")
        pages = []
    return pages

@st.cache_data(ttl=3600, show_spinner=False)
def preprocess_image(image_bytes, preprocessing_options):
    """Apply the selected preprocessing steps to an image and return PNG bytes.

    Steps run in a fixed order: grayscale, contrast, denoise, threshold.
    Results are memoised by Streamlit for one hour (``ttl=3600``).

    Args:
        image_bytes: Raw bytes of an image file readable by Pillow.
        preprocessing_options: Dict of options. Recognised keys:
            ``"grayscale"`` (bool), ``"contrast"`` (number; 0 means no
            change, each unit shifts the contrast factor by 0.1),
            ``"denoise"`` (bool) and ``"threshold"`` (bool). Missing keys
            are treated as disabled.

    Returns:
        The processed image encoded as PNG, as ``bytes``.
    """
    # Normalise to 8-bit RGB up front. Without this, grayscale ("L"),
    # palette ("P") or CMYK uploads reach the OpenCV calls below with the
    # wrong channel layout: COLOR_RGB2GRAY and fastNlMeansDenoisingColored
    # both expect a colour image, and palette arrays hold indices, not colours.
    image = Image.open(io.BytesIO(image_bytes)).convert("RGB")
    img_array = np.array(image)

    if preprocessing_options.get("grayscale", False):
        # Collapse to one channel, then expand again so later steps always
        # see a 3-channel array.
        gray = cv2.cvtColor(img_array, cv2.COLOR_RGB2GRAY)
        img_array = cv2.cvtColor(gray, cv2.COLOR_GRAY2RGB)

    contrast = preprocessing_options.get("contrast", 0)
    if contrast != 0:
        # Map the slider value onto a Pillow enhancement factor around 1.0
        # (e.g. -5 -> 0.5, +5 -> 1.5).
        contrast_factor = 1 + (contrast / 10)
        enhanced = ImageEnhance.Contrast(Image.fromarray(img_array)).enhance(contrast_factor)
        img_array = np.array(enhanced)

    if preprocessing_options.get("denoise", False):
        img_array = cv2.fastNlMeansDenoisingColored(img_array, None, 10, 10, 7, 21)

    if preprocessing_options.get("threshold", False):
        # The array is always 3-channel RGB at this point, so a single
        # conversion to grayscale is sufficient before thresholding.
        gray = cv2.cvtColor(img_array, cv2.COLOR_RGB2GRAY)
        binary = cv2.adaptiveThreshold(gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
                                       cv2.THRESH_BINARY, 11, 2)
        # Convert back to RGB so the output format is uniform.
        img_array = cv2.cvtColor(binary, cv2.COLOR_GRAY2RGB)

    # Encode the result as PNG and return the raw bytes.
    byte_io = io.BytesIO()
    Image.fromarray(img_array).save(byte_io, format='PNG')
    return byte_io.getvalue()

# Define functions
def process_file(uploaded_file, use_vision=True, preprocessing_options=None):
    """Process the uploaded file and return the OCR results.

    The upload is written to a temporary file (after optional image
    preprocessing) and passed to ``StructuredOCR.process_file``. Progress is
    shown through a Streamlit progress bar and a status line.

    Args:
        uploaded_file: The uploaded file to process; its ``name`` attribute
            and ``getvalue()`` method are used.
        use_vision: Forwarded to the OCR processor as ``use_vision``.
        preprocessing_options: Dictionary of preprocessing options handed to
            ``preprocess_image``. Only applied to non-PDF files, and only
            when at least one option value is truthy.

    Returns:
        The value returned by ``StructuredOCR.process_file`` (callers treat
        it as a dict); a placeholder dict when ``MISTRAL_API_KEY`` is not
        set; or an error dict when the file to send exceeds 50 MB.

    Raises:
        Exception: Any error raised during processing is reported with
            ``st.error`` and then re-raised.
    """
    if preprocessing_options is None:
        preprocessing_options = {}

    # Show progress indicator
    progress_bar = st.progress(0)
    status_text = st.empty()
    status_text.text("Preparing file for processing...")

    # Every temporary file created below is recorded here so the cleanup in
    # ``finally`` removes all of them, not just the most recent one.
    temp_paths = []

    try:
        # Save the uploaded file to a temporary file
        with tempfile.NamedTemporaryFile(delete=False, suffix=Path(uploaded_file.name).suffix) as tmp:
            temp_paths.append(tmp.name)
            tmp.write(uploaded_file.getvalue())
        temp_path = tmp.name

        # Check if API key is available
        if not MISTRAL_API_KEY:
            # Return dummy data if no API key
            progress_bar.progress(100)
            status_text.empty()
            return {
                "file_name": uploaded_file.name,
                "topics": ["Sample Document"],
                "languages": ["English"],
                "ocr_contents": {
                    "title": "Sample Document",
                    "content": "This is sample content. To process real documents, please set the MISTRAL_API_KEY environment variable."
                }
            }

        # Update progress
        progress_bar.progress(20)
        status_text.text("Initializing OCR processor...")

        # Initialize OCR processor
        processor = StructuredOCR()

        # Determine file type from extension
        file_ext = Path(uploaded_file.name).suffix.lower()
        file_type = "pdf" if file_ext == ".pdf" else "image"

        # Apply preprocessing if needed
        if file_type == "image" and any(preprocessing_options.values()):
            status_text.text("Applying image preprocessing...")
            processed_bytes = preprocess_image(uploaded_file.getvalue(), preprocessing_options)

            # preprocess_image always encodes its output as PNG, so the temp
            # file gets a .png suffix regardless of the upload's extension.
            with tempfile.NamedTemporaryFile(delete=False, suffix=".png") as proc_tmp:
                temp_paths.append(proc_tmp.name)
                proc_tmp.write(processed_bytes)
            temp_path = proc_tmp.name

        # Get file size in MB (of the file that will actually be sent)
        file_size_mb = os.path.getsize(temp_path) / (1024 * 1024)

        # Check if file exceeds API limits (50 MB)
        if file_size_mb > 50:
            # Clear the progress widgets so they do not linger next to the error.
            progress_bar.progress(100)
            status_text.empty()
            st.error(f"File too large ({file_size_mb:.1f} MB). Maximum file size allowed by Mistral API is 50MB.")
            return {
                "file_name": uploaded_file.name,
                "topics": ["Document"],
                "languages": ["English"],
                "confidence_score": 0.0,
                "error": f"File size {file_size_mb:.2f} MB exceeds Mistral API limit of 50 MB",
                "ocr_contents": {
                    "error": f"Failed to process file: File size {file_size_mb:.2f} MB exceeds Mistral API limit of 50 MB",
                    "partial_text": "Document could not be processed due to size limitations."
                }
            }

        # Update progress
        progress_bar.progress(40)
        status_text.text("Processing document with OCR...")

        # Process the file with file size information for automatic page limiting
        # See https://docs.mistral.ai/capabilities/document/ for more info
        result = processor.process_file(temp_path, file_type=file_type, use_vision=use_vision, file_size_mb=file_size_mb)

        # Complete progress
        progress_bar.progress(100)
        status_text.empty()

        return result
    except Exception as e:
        progress_bar.progress(100)
        status_text.empty()
        st.error(f"Error during processing: {str(e)}")
        raise
    finally:
        # Clean up every temporary file created above
        for path in temp_paths:
            if os.path.exists(path):
                os.unlink(path)

# App title and description
st.title("Historical Document OCR")
st.subheader("Powered by Mistral AI")

# Create main layout with tabs and columns.
# main_tab1 is re-entered further down the script to render the preview and
# the processing results; main_tab2 holds the static "About" text.
main_tab1, main_tab2 = st.tabs(["Document Processing", "About"])

with main_tab1:
    # Create a two-column layout for file upload and preview
    upload_col, preview_col = st.columns([1, 1])
    
    # File uploader in the left column
    with upload_col:
        st.markdown("""
        Upload an image or PDF file to get started. 
        
        Using the latest `mistral-ocr-latest` model for advanced document understanding.
        """)
        # The rest of the script checks uploaded_file against None to decide
        # between the results view and the sample-documents view.
        uploaded_file = st.file_uploader("Choose a file", type=["pdf", "png", "jpg", "jpeg"])

# Sidebar: user-tunable settings read by the main processing section
with st.sidebar:
    st.header("Options")

    # Model selection
    st.subheader("Model Settings")
    use_vision = st.checkbox(
        "Use Vision Model",
        value=True,
        help="For image files, use the vision model for improved analysis (may be slower)",
    )

    # Image preprocessing toggles, gathered into one dict keyed by option name.
    # The widgets are created in the same order as the dict entries.
    st.subheader("Image Preprocessing")
    with st.expander("Preprocessing Options"):
        preprocessing_options = {
            "grayscale": st.checkbox(
                "Convert to Grayscale",
                help="Convert image to grayscale before OCR",
            ),
            "threshold": st.checkbox(
                "Apply Thresholding",
                help="Apply adaptive thresholding to enhance text",
            ),
            "denoise": st.checkbox(
                "Denoise Image",
                help="Remove noise from the image",
            ),
            "contrast": st.slider(
                "Adjust Contrast",
                min_value=-5,
                max_value=5,
                value=0,
                help="Adjust image contrast (-5 to +5)",
            ),
        }

    # PDF rendering controls.
    # NOTE(review): pdf_dpi and max_pages are collected here, but nothing in
    # this file passes them on to process_file — confirm intent.
    st.subheader("PDF Options")
    with st.expander("PDF Settings"):
        pdf_dpi = st.slider(
            "PDF Resolution (DPI)",
            min_value=72,
            max_value=300,
            value=150,
            help="Higher DPI gives better quality but slower processing",
        )
        max_pages = st.number_input(
            "Maximum Pages to Process",
            min_value=1,
            max_value=20,
            value=5,
            help="Limit number of pages to process",
        )

# About tab content: a single static markdown description of the app.
# The text is user-facing; it is rendered as-is by st.markdown.
with main_tab2:
    st.markdown("""
    ### About This Application
    
    This app uses [Mistral AI's Document OCR](https://docs.mistral.ai/capabilities/document/) to extract text and images from historical documents.
    
    It can process:
    - Image files (jpg, png, etc.)
    - PDF documents (multi-page support)
    
    The extracted content is processed into structured data based on the document type, combining:
    - Text extraction with `mistral-ocr-latest`
    - Analysis with language models
    - Layout preservation with images
    
    View results in three formats:
    - Structured HTML view
    - Raw JSON (for developers)
    - Markdown with images (preserves document layout)
    
    **New Features:**
    - Image preprocessing for better OCR quality
    - PDF resolution and page controls
    - Progress tracking during processing
    """)

# Main processing view: preview the upload, run OCR on demand and render the
# results; when nothing is uploaded, show sample documents instead.
with main_tab1:
    if uploaded_file is not None:
        # Check file size (cap at 50MB)
        file_size_mb = len(uploaded_file.getvalue()) / (1024 * 1024)

        if file_size_mb > 50:
            with upload_col:
                st.error(f"File too large ({file_size_mb:.1f} MB). Maximum file size is 50MB.")
            st.stop()

        file_ext = Path(uploaded_file.name).suffix.lower()

        # Display document preview in preview column
        with preview_col:
            st.subheader("Document Preview")
            if file_ext == ".pdf":
                try:
                    # Convert first page of PDF to image for preview
                    pdf_bytes = uploaded_file.getvalue()
                    images = convert_from_bytes(pdf_bytes, first_page=1, last_page=1, dpi=150)

                    if images:
                        # Convert PIL image to bytes for Streamlit
                        first_page = images[0]
                        img_bytes = io.BytesIO()
                        first_page.save(img_bytes, format='JPEG')
                        img_bytes.seek(0)

                        # Display the PDF preview
                        st.image(img_bytes, caption=f"PDF Preview: {uploaded_file.name}", use_container_width=True)
                    else:
                        st.info(f"PDF uploaded: {uploaded_file.name}")
                except Exception:
                    # Deliberate best-effort: the preview is optional, so simply
                    # show the file name without an error message.
                    st.info(f"PDF uploaded: {uploaded_file.name}")
                    st.info("Click 'Process Document' to analyze the content.")
            else:
                st.image(uploaded_file, use_container_width=True)

        # Add image preprocessing preview in a collapsible section if needed
        if any(preprocessing_options.values()) and uploaded_file.type.startswith('image/'):
            with st.expander("Image Preprocessing Preview"):
                preview_cols = st.columns(2)

                with preview_cols[0]:
                    st.markdown("**Original Image**")
                    st.image(uploaded_file, use_container_width=True)

                with preview_cols[1]:
                    st.markdown("**Preprocessed Image**")
                    try:
                        processed_bytes = preprocess_image(uploaded_file.getvalue(), preprocessing_options)
                        st.image(io.BytesIO(processed_bytes), use_container_width=True)
                    except Exception as e:
                        st.error(f"Error in preprocessing: {str(e)}")

        # Process button - flush left with similar padding as file browser
        with upload_col:
            process_button = st.button("Process Document", use_container_width=True)

        # Results section
        if process_button:
            try:
                # Call process_file with all options
                result = process_file(uploaded_file, use_vision, preprocessing_options)

                # Create results tabs for better organization
                results_tab1, results_tab2 = st.tabs(["Document Analysis", "Technical Details"])

                with results_tab1:
                    # Create two columns for metadata and content
                    meta_col, content_col = st.columns([1, 2])

                    with meta_col:
                        st.subheader("Document Metadata")
                        # process_file can return a dict carrying an "error"
                        # key; do not announce success in that case.
                        if result.get('error'):
                            st.error(f"**Document processing failed:** {result['error']}")
                        else:
                            st.success("**Document processed successfully**")

                        # Display file info
                        st.write(f"**File Name:** {result.get('file_name', uploaded_file.name)}")

                        # Display info if only limited pages were processed
                        if 'limited_pages' in result:
                            st.info(f"Processed {result['limited_pages']['processed']} of {result['limited_pages']['total']} pages")

                        # Display languages if available
                        if 'languages' in result:
                            languages = [lang for lang in result['languages'] if lang is not None]
                            if languages:
                                st.write(f"**Languages:** {', '.join(languages)}")

                        # Confidence score if available. Only format numeric
                        # values: a None/str score would make the ".1%" format
                        # raise and hide every result behind the outer except.
                        if 'confidence_score' in result:
                            confidence = result['confidence_score']
                            if isinstance(confidence, (int, float)):
                                st.write(f"**OCR Confidence:** {confidence:.1%}")

                        # Display topics if available (None entries are
                        # dropped, matching the handling of languages above)
                        if 'topics' in result and result['topics']:
                            topics = [topic for topic in result['topics'] if topic is not None]
                            if topics:
                                st.write(f"**Topics:** {', '.join(topics)}")

                    with content_col:
                        st.subheader("Document Contents")
                        if 'ocr_contents' in result:
                            # Check if there are images in the OCR result
                            has_images = False
                            if 'raw_response' in result:
                                try:
                                    has_images = any(page.images for page in result['raw_response'].pages)
                                except Exception:
                                    has_images = False

                            # Create tabs for different views
                            if has_images:
                                view_tab1, view_tab2, view_tab3 = st.tabs(["Structured View", "Raw JSON", "With Images"])
                            else:
                                view_tab1, view_tab2 = st.tabs(["Structured View", "Raw JSON"])

                            with view_tab1:
                                # Render each section in the UI and, in parallel,
                                # collect an HTML version for the export button.
                                # All document-derived text is HTML-escaped in the
                                # export so OCR output cannot inject markup.
                                html_parts = []
                                if isinstance(result['ocr_contents'], dict):
                                    for section, content in result['ocr_contents'].items():
                                        if content:  # Only display non-empty sections
                                            section_label = section.replace('_', ' ').title()
                                            html_parts.append(f"<h4>{html.escape(section_label)}</h4>")

                                            if isinstance(content, str):
                                                html_parts.append(f"<p>{html.escape(content)}</p>")
                                                st.markdown(f"#### {section_label}")
                                                st.markdown(content)
                                            elif isinstance(content, list):
                                                html_parts.append("<ul>")
                                                st.markdown(f"#### {section_label}")
                                                for item in content:
                                                    if isinstance(item, str):
                                                        html_parts.append(f"<li>{html.escape(item)}</li>")
                                                        st.markdown(f"- {item}")
                                                    elif isinstance(item, dict):
                                                        html_parts.append(f"<li>{html.escape(json.dumps(item))}</li>")
                                                        st.json(item)
                                                html_parts.append("</ul>")
                                            elif isinstance(content, dict):
                                                html_parts.append("<dl>")
                                                st.markdown(f"#### {section_label}")
                                                for k, v in content.items():
                                                    html_parts.append(
                                                        f"<dt><strong>{html.escape(str(k))}</strong></dt>"
                                                        f"<dd>{html.escape(str(v))}</dd>"
                                                    )
                                                    st.markdown(f"**{k}:** {v}")
                                                html_parts.append("</dl>")
                                html_content = "".join(html_parts)

                                # Add download button in a smaller section
                                with st.expander("Export Content"):
                                    html_bytes = html_content.encode()
                                    st.download_button(
                                        label="Download as HTML",
                                        data=html_bytes,
                                        file_name="document_content.html",
                                        mime="text/html"
                                    )

                            with view_tab2:
                                # Show the raw JSON for developers
                                st.json(result)

                            if has_images:
                                with view_tab3:
                                    # Show loading indicator while preparing images
                                    with st.spinner("Preparing document with embedded images..."):
                                        # NOTE(review): each st.stop() below halts the
                                        # whole script run, so the "Technical Details"
                                        # tab is not rendered on those paths.
                                        try:
                                            # Import function
                                            try:
                                                from ocr_utils import get_combined_markdown
                                            except ImportError:
                                                st.error("Required module ocr_utils not found.")
                                                st.stop()

                                            # Check if raw_response is available
                                            if 'raw_response' not in result:
                                                st.warning("Raw OCR response not available. Cannot display images.")
                                                st.stop()

                                            # Validate the raw_response structure before processing
                                            if not hasattr(result['raw_response'], 'pages'):
                                                st.warning("Invalid OCR response format. Cannot display images.")
                                                st.stop()

                                            # Get the combined markdown with images
                                            combined_markdown = get_combined_markdown(result['raw_response'])

                                            if not combined_markdown or combined_markdown.strip() == "":
                                                st.warning("No image content found in the document.")
                                                st.stop()

                                            # Add CSS to ensure proper spacing and handling of text and images
                                            st.markdown("""
                                            <style>
                                            .markdown-text-container {
                                                padding: 10px;
                                                background-color: #f9f9f9;
                                                border-radius: 5px;
                                            }
                                            .markdown-text-container img {
                                                margin: 15px 0;
                                                max-width: 100%;
                                                border: 1px solid #ddd;
                                                border-radius: 4px;
                                                display: block;
                                            }
                                            .markdown-text-container p {
                                                margin-bottom: 16px;
                                                line-height: 1.6;
                                            }
                                            </style>
                                            """, unsafe_allow_html=True)

                                            # Wrap the markdown in a div with the class for styling.
                                            # The wrapper is built without source indentation and
                                            # with blank lines around the markdown: the interpolated
                                            # text starts at column 0, so an indented template cannot
                                            # be dedented uniformly, and without the blank lines the
                                            # leading markdown is treated as part of the raw HTML block.
                                            st.markdown(
                                                '<div class="markdown-text-container">\n\n'
                                                f"{combined_markdown}\n\n"
                                                "</div>",
                                                unsafe_allow_html=True,
                                            )

                                            # Add a download button for the combined content
                                            # NOTE(review): combined_markdown is inserted as-is, so the
                                            # downloaded file contains markdown text inside an HTML shell.
                                            st.download_button(
                                                label="Download with Images (HTML)",
                                                data=f"""
                                                <html>
                                                <head>
                                                    <style>
                                                    body {{ font-family: Arial, sans-serif; line-height: 1.6; }}
                                                    img {{ max-width: 100%; margin: 15px 0; }}
                                                    </style>
                                                </head>
                                                <body>
                                                {combined_markdown}
                                                </body>
                                                </html>
                                                """,
                                                file_name="document_with_images.html",
                                                mime="text/html"
                                            )

                                        except Exception as e:
                                            st.error(f"Could not display document with images: {str(e)}")
                                            st.info("Try refreshing or processing the document again.")
                        else:
                            st.error("No OCR content was extracted from the document.")

                with results_tab2:
                    st.subheader("Raw Processing Results")
                    st.json(result)

            except Exception as e:
                st.error(f"Error processing document: {str(e)}")
    else:
        # Display sample images in the main area when no file is uploaded
        st.info("Upload a document to get started using the file uploader above.")

        # Show example images in a grid
        st.subheader("Example Documents")

        # Add a sample images container
        with st.container():
            # Find sample images from the input directory to display
            input_dir = Path(__file__).parent / "input"
            sample_images = []
            if input_dir.exists():
                sample_images = list(input_dir.glob("*.jpg"))[:3]  # Limit to 3 samples

            if sample_images:
                columns = st.columns(3)
                for i, img_path in enumerate(sample_images):
                    with columns[i % 3]:
                        st.image(str(img_path), caption=img_path.name, use_container_width=True)