', unsafe_allow_html=True)
-
- # Handle sample document recreation if needed
- if process_button and st.session_state.processed_document_active and st.session_state.original_sample_bytes is not None:
- # Recreate the uploaded file from stored bytes
- from io import BytesIO
- import mimetypes
-
- # Determine mime type based on file extension
- file_ext = os.path.splitext(st.session_state.original_sample_name)[1].lower()
- if file_ext == '.pdf':
- mime_type = 'application/pdf'
- elif file_ext in ['.jpg', '.jpeg']:
- mime_type = 'image/jpeg'
- elif file_ext == '.png':
- mime_type = 'image/png'
- else:
- mime_type = mimetypes.guess_type(st.session_state.original_sample_name)[0] or 'application/octet-stream'
+ # Apply preprocessing if needed
+ if any(preprocessing_options.values()) and file_type == "image":
+ status_text.text("Applying image preprocessing...")
+ processed_bytes = preprocess_image(uploaded_file.getvalue(), preprocessing_options)
- # Create a synthetic file-like object with the same interface as UploadedFile
- uploaded_file = type('obj', (object,), {
- 'name': st.session_state.original_sample_name,
- 'getvalue': lambda: st.session_state.original_sample_bytes,
- 'read': lambda: st.session_state.original_sample_bytes,
- 'seek': lambda x: None,
- 'type': mime_type
- })
+ # Save processed image to temp file
+ with tempfile.NamedTemporaryFile(delete=False, suffix=Path(uploaded_file.name).suffix) as proc_tmp:
+ proc_tmp.write(processed_bytes)
+ temp_path = proc_tmp.name
- # Empty container for progress indicators - will be filled during processing
- # Positioned right after the process button for better visibility
- progress_placeholder = st.empty()
+ # Get file size in MB
+ file_size_mb = os.path.getsize(temp_path) / (1024 * 1024)
- # Image preprocessing preview - show if image file and preprocessing options are set
- # Remove the document active check to show preview immediately after selection
- if (any(sidebar_options["preprocessing_options"].values()) and
- uploaded_file.type.startswith('image/')):
-
- st.markdown("**Preprocessed Preview**")
- try:
- # Create a container for the preview
- with st.container():
- processed_bytes = preprocess_image(uploaded_file.getvalue(), sidebar_options["preprocessing_options"])
- # Convert image to base64 and display as HTML to avoid fullscreen button
- img_data = base64.b64encode(processed_bytes).decode()
- img_html = f''
- st.markdown(img_html, unsafe_allow_html=True)
-
- # Show preprocessing metadata in a well-formatted caption
- meta_items = []
- # Only include document type in the list if actual preprocessing is applied
- has_active_preprocessing = (
- sidebar_options["preprocessing_options"].get("grayscale", False) or
- sidebar_options["preprocessing_options"].get("denoise", False) or
- sidebar_options["preprocessing_options"].get("contrast", 0) != 0 or
- sidebar_options["preprocessing_options"].get("rotation", 0) != 0
- )
-
- # Only show document type if there's actual preprocessing being applied
- if has_active_preprocessing and sidebar_options["preprocessing_options"].get("document_type", "standard") != "standard":
- meta_items.append(f"Document type ({sidebar_options['preprocessing_options']['document_type']})")
- if sidebar_options["preprocessing_options"].get("grayscale", False):
- meta_items.append("Grayscale")
- if sidebar_options["preprocessing_options"].get("denoise", False):
- meta_items.append("Denoise")
- if sidebar_options["preprocessing_options"].get("contrast", 0) != 0:
- meta_items.append(f"Contrast ({sidebar_options['preprocessing_options']['contrast']})")
- if sidebar_options["preprocessing_options"].get("rotation", 0) != 0:
- meta_items.append(f"Rotation ({sidebar_options['preprocessing_options']['rotation']}°)")
-
- # Only show "Applied:" if there are actual preprocessing steps
- if meta_items:
- meta_text = "Applied: " + ", ".join(meta_items)
- st.caption(meta_text)
- except Exception as e:
- st.error(f"Error in preprocessing: {str(e)}")
- st.info("Try using grayscale preprocessing for PNG images with transparency")
+ # Check if file exceeds API limits (50 MB)
+ if file_size_mb > 50:
+ st.error(f"File too large ({file_size_mb:.1f} MB). Maximum file size allowed by Mistral API is 50MB.")
+ return {
+ "file_name": uploaded_file.name,
+ "topics": ["Document"],
+ "languages": ["English"],
+ "confidence_score": 0.0,
+ "error": f"File size {file_size_mb:.2f} MB exceeds Mistral API limit of 50 MB",
+ "ocr_contents": {
+ "error": f"Failed to process file: File size {file_size_mb:.2f} MB exceeds Mistral API limit of 50 MB",
+ "partial_text": "Document could not be processed due to size limitations."
+ }
+ }
- # Container for success message (will be filled after processing)
- metadata_placeholder = st.empty()
-
- # Check if this is an auto-processing situation
- auto_processing = st.session_state.auto_process_sample and not st.session_state.processed_document_active
+ # Update progress
+ progress_bar.progress(40)
+ status_text.text("Processing document with OCR...")
- # Show a message if auto-processing is happening
- auto_processing_message = st.empty()
- if auto_processing:
- auto_processing_message.info("Automatically processing sample document...")
+ # Process the file with file size information for automatic page limiting
+ # Make sure we're using the latest mistral-ocr model
+ # See https://docs.mistral.ai/capabilities/document/ for more info
+ result = processor.process_file(temp_path, file_type=file_type, use_vision=use_vision, file_size_mb=file_size_mb)
- # Determine if we should process the document
- # Either process button was clicked OR auto-processing is happening
- should_process = process_button or auto_processing
-
- if should_process:
- # Reset auto-process flag to avoid processing on next rerun
- if st.session_state.auto_process_sample:
- st.session_state.auto_process_sample = False
-
- # Move the progress indicator reference to just below the button
- progress_reporter = ProgressReporter(progress_placeholder).setup()
+ # Complete progress
+ progress_bar.progress(100)
+ status_text.empty()
- try:
- # Process the document, capturing both result and temp file paths
- # Modified to pass existing temp_file_paths to avoid resource leaks
- existing_temp_paths = []
- if 'temp_file_paths' in st.session_state:
- existing_temp_paths = st.session_state.temp_file_paths
-
- result = process_file(
- uploaded_file=uploaded_file,
- use_vision=sidebar_options["use_vision"],
- preprocessing_options=sidebar_options["preprocessing_options"],
- progress_reporter=progress_reporter,
- pdf_dpi=sidebar_options.get("pdf_dpi", 150),
- max_pages=sidebar_options.get("max_pages", 3),
- pdf_rotation=sidebar_options.get("pdf_rotation", 0),
- custom_prompt=sidebar_options.get("custom_prompt", ""),
- perf_mode=sidebar_options.get("perf_mode", "Quality"),
- use_segmentation=sidebar_options.get("use_segmentation", False)
- )
-
- # Ensure temp_file_paths in session state is updated with any new paths
- # This is critical for proper resource cleanup when document is closed
- if 'has_images' in result and result['has_images']:
- logger.info("Document has images, ensuring temp files are tracked")
- if 'temp_file_paths' not in st.session_state:
- st.session_state.temp_file_paths = []
-
- # Handle text-only OCR results (like the Milgram flier)
- if ('ocr_contents' in result and
- 'raw_text' in result['ocr_contents'] and
- len(result['ocr_contents']) <= 2 and # Only raw_text and possibly one other field
- 'has_images' not in result):
- logger.info("Text-only OCR detected, handling as special case")
- # Ensure raw_text is properly formatted as markdown
- raw_text = result['ocr_contents']['raw_text']
- # If we don't have other structured content, set a placeholder title
- if 'title' not in result['ocr_contents']:
- result['ocr_contents']['title'] = "Document Text"
-
- # Display success message at the top of results, before any previews
- with left_col:
- # First show the success message (full width)
- st.success("**Document processed successfully**")
-
- # Then show the close button (also full width, positioned to left)
- st.button("Close Document",
- key="close_document_btn",
- type="secondary",
- on_click=close_document)
-
- # Add a small spacer
- st.markdown("", unsafe_allow_html=True)
-
- # Display results
- display_results(result, right_col, sidebar_options.get("custom_prompt", ""))
-
- # Set processed_document_active to True when a new document is processed
- st.session_state.processed_document_active = True
-
- # Clear the auto-processing message
- auto_processing_message.empty()
-
- # Store information about this processed file to track when new files are uploaded
- if uploaded_file is not None:
- st.session_state.last_processed_file = current_file_identifier
-
- # Store the result in the previous results list
- # Add timestamp to result for history tracking
- result_copy = result.copy()
- result_copy['timestamp'] = format_timestamp()
-
- # Store if this was a sample document
- if 'is_sample_document' in st.session_state and st.session_state.is_sample_document:
- result_copy['sample_document'] = True
-
- # Add to session state, keeping the most recent 20 results
- st.session_state.previous_results.insert(0, result_copy)
- if len(st.session_state.previous_results) > 20:
- st.session_state.previous_results = st.session_state.previous_results[:20]
-
- except Exception as e:
- st.error(f"Error processing document: {str(e)}")
-
- # Log the error
- import logging
- logging.error(f"Document processing error: {str(e)}", exc_info=True)
+ return result
+ except Exception as e:
+ progress_bar.progress(100)
+ status_text.empty()
+ st.error(f"Error during processing: {str(e)}")
+ raise
+ finally:
+ # Clean up the temporary file
+ if os.path.exists(temp_path):
+ os.unlink(temp_path)
+
+# App title and description
+st.title("Historical Document OCR")
+st.subheader("Powered by Mistral AI")
+
+# Create main layout with tabs and columns
+main_tab1, main_tab2 = st.tabs(["Document Processing", "About"])
-def main():
- """Main application function"""
- # Initialize session state
- init_session_state()
+with main_tab1:
+ # Create a two-column layout for file upload and preview
+ upload_col, preview_col = st.columns([1, 1])
- # Handle any required cleanup at the start of execution
- # CRITICAL: This two-phase state cleanup pattern is essential for Streamlit's execution model.
- # When close_clicked is True, we need to restart the app's execution with a clean slate.
- # DO NOT REMOVE OR MODIFY this pattern as it ensures proper UI cleanup.
- if st.session_state.get('close_clicked', False):
- # Reset the flag - cleanup has been handled
- st.session_state.close_clicked = False
- # Don't do anything else in this run - force a clean restart
- st.rerun()
+ # File uploader in the left column
+ with upload_col:
+ st.markdown("""
+ Upload an image or PDF file to get started.
- # Initialize new flag for redirecting to processing tab
- if 'redirect_to_processing' not in st.session_state:
- st.session_state.redirect_to_processing = False
+ Using the latest `mistral-ocr-latest` model for advanced document understanding.
+ """)
+ uploaded_file = st.file_uploader("Choose a file", type=["pdf", "png", "jpg", "jpeg"], help="Limit 50MB per file")
+
+# Sidebar with options
+with st.sidebar:
+ st.header("Options")
+
+ # Model options
+ st.subheader("Model Settings")
+ use_vision = st.checkbox("Use Vision Model", value=True,
+ help="For image files, use the vision model for improved analysis (may be slower)")
- # Apply custom CSS
- from ui.layout import load_css
- load_css()
+ # Image preprocessing options (collapsible)
+ st.subheader("Image Preprocessing")
+ with st.expander("Preprocessing Options"):
+ preprocessing_options = {}
+ preprocessing_options["grayscale"] = st.checkbox("Convert to Grayscale",
+ help="Convert image to grayscale before OCR")
+ preprocessing_options["threshold"] = st.checkbox("Apply Thresholding",
+ help="Apply adaptive thresholding to enhance text")
+ preprocessing_options["denoise"] = st.checkbox("Denoise Image",
+ help="Remove noise from the image")
+ preprocessing_options["contrast"] = st.slider("Adjust Contrast", -5, 5, 0,
+ help="Adjust image contrast (-5 to +5)")
- # Create sidebar options
- sidebar_options = create_sidebar_options()
+ # PDF options (collapsible)
+ st.subheader("PDF Options")
+ with st.expander("PDF Settings"):
+ pdf_dpi = st.slider("PDF Resolution (DPI)", 72, 300, 150,
+ help="Higher DPI gives better quality but slower processing")
+ max_pages = st.number_input("Maximum Pages to Process", 1, 20, 5,
+ help="Limit number of pages to process")
+
+# About tab content
+with main_tab2:
+ st.markdown("""
+ ### About This Application
- # Create main layout with tabs - simpler, more compact approach
- tab_names = ["Document Processing", "Sample Documents", "Learn More"]
- main_tab1, main_tab2, main_tab3 = st.tabs(tab_names)
+ This app uses [Mistral AI's Document OCR](https://docs.mistral.ai/capabilities/document/) to extract text and images from historical documents.
- with main_tab1:
- # Create a two-column layout for file upload and results with minimal padding
- st.markdown('', unsafe_allow_html=True)
- # Using a 2:3 column ratio gives more space to the results column
- left_col, right_col = st.columns([2, 3])
+ It can process:
+ - Image files (jpg, png, etc.)
+ - PDF documents (multi-page support)
+
+ The extracted content is processed into structured data based on the document type, combining:
+ - Text extraction with `mistral-ocr-latest`
+ - Analysis with language models
+ - Layout preservation with images
+
+ View results in three formats:
+ - Structured HTML view
+ - Raw JSON (for developers)
+ - Markdown with images (preserves document layout)
+
+ **New Features:**
+ - Image preprocessing for better OCR quality
+ - PDF resolution and page controls
+ - Progress tracking during processing
+ """)
+
+with main_tab1:
+ if uploaded_file is not None:
+ # Check file size (cap at 50MB)
+ file_size_mb = len(uploaded_file.getvalue()) / (1024 * 1024)
- with left_col:
- # Create file uploader
- uploaded_file = create_file_uploader()
-
- # If a real file is uploaded, clear any sample document
- if uploaded_file is not None and 'sample_document' in st.session_state:
- st.session_state.sample_document = None
- st.session_state.is_sample_document = False
+ if file_size_mb > 50:
+ with upload_col:
+ st.error(f"File too large ({file_size_mb:.1f} MB). Maximum file size is 50MB.")
+ st.stop()
+
+ file_ext = Path(uploaded_file.name).suffix.lower()
+
+ # Display document preview in preview column
+ with preview_col:
+ st.subheader("Document Preview")
+ if file_ext == ".pdf":
+ try:
+ # Convert first page of PDF to image for preview
+ pdf_bytes = uploaded_file.getvalue()
+ images = convert_from_bytes(pdf_bytes, first_page=1, last_page=1, dpi=150)
+
+ if images:
+ # Convert PIL image to bytes for Streamlit
+ first_page = images[0]
+ img_bytes = io.BytesIO()
+ first_page.save(img_bytes, format='JPEG')
+ img_bytes.seek(0)
+
+ # Display the PDF preview
+ st.image(img_bytes, caption=f"PDF Preview: {uploaded_file.name}", use_container_width=True)
+ else:
+ st.info(f"PDF uploaded: {uploaded_file.name}")
+ except Exception:
+ # Simply show the file name without an error message
+ st.info(f"PDF uploaded: {uploaded_file.name}")
+ st.info("Click 'Process Document' to analyze the content.")
+ else:
+ st.image(uploaded_file, use_container_width=True)
+
+ # Add image preprocessing preview in a collapsible section if needed
+ if any(preprocessing_options.values()) and uploaded_file.type.startswith('image/'):
+ with st.expander("Image Preprocessing Preview"):
+ preview_cols = st.columns(2)
- # Check if we have a sample document loaded (only if no real file uploaded)
- elif ('sample_document' in st.session_state and
- st.session_state.sample_document is not None):
+ with preview_cols[0]:
+ st.markdown("**Original Image**")
+ st.image(uploaded_file, use_container_width=True)
- # Use the sample document instead of the uploaded file
- uploaded_file = st.session_state.sample_document
+ with preview_cols[1]:
+ st.markdown("**Preprocessed Image**")
+ try:
+ processed_bytes = preprocess_image(uploaded_file.getvalue(), preprocessing_options)
+ st.image(io.BytesIO(processed_bytes), use_container_width=True)
+ except Exception as e:
+ st.error(f"Error in preprocessing: {str(e)}")
+
+ # Process button - flush left with similar padding as file browser
+ with upload_col:
+ process_button = st.button("Process Document", use_container_width=True)
+
+ # Results section
+ if process_button:
+ try:
+ # Get max_pages or default if not available
+ max_pages_value = max_pages if 'max_pages' in locals() else None
- # Just reset the sample document loading flags after it's been used
- if st.session_state.sample_just_loaded:
- st.session_state.sample_just_loaded = False
- st.session_state.sample_document_processed = True
- st.session_state.auto_process_sample = True
-
- # Only process document if available
- if uploaded_file is not None:
- process_document(uploaded_file, left_col, right_col, sidebar_options)
-
- with main_tab2:
- # Sample Documents tab
+ # Call process_file with all options
+ result = process_file(uploaded_file, use_vision, preprocessing_options)
+
+ # Create results tabs for better organization
+ results_tab1, results_tab2 = st.tabs(["Document Analysis", "Technical Details"])
+
+ with results_tab1:
+ # Create two columns for metadata and content
+ meta_col, content_col = st.columns([1, 2])
+
+ with meta_col:
+ st.subheader("Document Metadata")
+ st.success("**Document processed successfully**")
+
+ # Display file info
+ st.write(f"**File Name:** {result.get('file_name', uploaded_file.name)}")
+
+ # Display info if only limited pages were processed
+ if 'limited_pages' in result:
+ st.info(f"Processed {result['limited_pages']['processed']} of {result['limited_pages']['total']} pages")
+
+ # Display languages if available
+ if 'languages' in result:
+ languages = [lang for lang in result['languages'] if lang is not None]
+ if languages:
+ st.write(f"**Languages:** {', '.join(languages)}")
+
+ # Confidence score if available
+ if 'confidence_score' in result:
+ confidence = result['confidence_score']
+ st.write(f"**OCR Confidence:** {confidence:.1%}")
+
+ # Display topics if available
+ if 'topics' in result and result['topics']:
+ st.write(f"**Topics:** {', '.join(result['topics'])}")
+
+ with content_col:
+ st.subheader("Document Contents")
+ if 'ocr_contents' in result:
+ # Check if there are images in the OCR result
+ has_images = False
+ if 'raw_response' in result:
+ try:
+ has_images = any(page.images for page in result['raw_response'].pages)
+ except Exception:
+ has_images = False
+
+ # Create tabs for different views
+ if has_images:
+ view_tab1, view_tab2, view_tab3 = st.tabs(["Structured View", "Raw JSON", "With Images"])
+ else:
+ view_tab1, view_tab2 = st.tabs(["Structured View", "Raw JSON"])
+
+ with view_tab1:
+ # Display in a more user-friendly format based on the content structure
+ html_content = ""
+ if isinstance(result['ocr_contents'], dict):
+ for section, content in result['ocr_contents'].items():
+ if content: # Only display non-empty sections
+ section_title = f"
"
+ st.markdown(f"#### {section.replace('_', ' ').title()}")
+ for k, v in content.items():
+ html_dict += f"
{k}
{v}
"
+ st.markdown(f"**{k}:** {v}")
+ html_dict += "
"
+ html_content += html_dict
+
+ # Add download button in a smaller section
+ with st.expander("Export Content"):
+ # Alternative download button
+ html_bytes = html_content.encode()
+ st.download_button(
+ label="Download as HTML",
+ data=html_bytes,
+ file_name="document_content.html",
+ mime="text/html"
+ )
+
+ with view_tab2:
+ # Show the raw JSON for developers
+ st.json(result)
+
+ if has_images:
+ with view_tab3:
+ # Show loading indicator while preparing images
+ with st.spinner("Preparing document with embedded images..."):
+ try:
+ # Import function
+ try:
+ from ocr_utils import get_combined_markdown
+ except ImportError:
+ st.error("Required module ocr_utils not found.")
+ st.stop()
+
+ # Check if raw_response is available
+ if 'raw_response' not in result:
+ st.warning("Raw OCR response not available. Cannot display images.")
+ st.stop()
+
+ # Validate the raw_response structure before processing
+ if not hasattr(result['raw_response'], 'pages'):
+ st.warning("Invalid OCR response format. Cannot display images.")
+ st.stop()
+
+ # Get the combined markdown with images
+ # Set a flag to compress images if needed
+ compress_images = True
+ max_image_width = 800 # Maximum width for images
+
+ try:
+ # First try to get combined markdown with compressed images
+ if compress_images and hasattr(result['raw_response'], 'pages'):
+ from ocr_utils import get_combined_markdown_compressed
+ combined_markdown = get_combined_markdown_compressed(
+ result['raw_response'],
+ max_width=max_image_width,
+ quality=85
+ )
+ else:
+ # Fall back to regular method if compression not available
+ combined_markdown = get_combined_markdown(result['raw_response'])
+ except (ImportError, AttributeError):
+ # Fall back to regular method
+ combined_markdown = get_combined_markdown(result['raw_response'])
+
+ if not combined_markdown or combined_markdown.strip() == "":
+ st.warning("No image content found in the document.")
+ st.stop()
+
+ # Check if there are many images that might cause loading issues
+ image_count = sum(len(page.images) for page in result['raw_response'].pages if hasattr(page, 'images'))
+
+ # Add warning for image-heavy documents
+ if image_count > 10:
+ st.warning(f"This document contains {image_count} images. Rendering may take longer than usual.")
+
+ # Add CSS to ensure proper spacing and handling of text and images
+ st.markdown("""
+
+ """, unsafe_allow_html=True)
+
+ # For very image-heavy documents, show images in a paginated way
+ if image_count > 20:
+ # Show image content in a paginated way
+ st.write("Document contains many images. Showing in a paginated format:")
+
+ # Split the combined markdown by page separators
+ pages = combined_markdown.split("---")
+
+ # Create a page selector
+ page_num = st.selectbox("Select page to view:",
+ options=list(range(1, len(pages)+1)),
+ index=0)
+
+ # Display only the selected page
+ st.markdown(f"""
+
+ {pages[page_num-1]}
+
+ """, unsafe_allow_html=True)
+
+ # Add note about pagination
+ st.info(f"Showing page {page_num} of {len(pages)}. Select a different page from the dropdown above.")
+ else:
+ # Wrap the markdown in a div with the class for styling
+ st.markdown(f"""
+
+ {combined_markdown}
+
+ """, unsafe_allow_html=True)
+
+ # Add a download button for the combined content
+ st.download_button(
+ label="Download with Images (HTML)",
+ data=f"""
+
+
+
+
+
+ {combined_markdown}
+
+
+ """,
+ file_name="document_with_images.html",
+ mime="text/html"
+ )
+
+ except Exception as e:
+ st.error(f"Could not display document with images: {str(e)}")
+ st.info("Try refreshing or processing the document again.")
+ else:
+ st.error("No OCR content was extracted from the document.")
+
+ with results_tab2:
+ st.subheader("Raw Processing Results")
+ st.json(result)
+
+ except Exception as e:
+ st.error(f"Error processing document: {str(e)}")
+ else:
+ # Display sample images in the main area when no file is uploaded
+ st.info("Upload a document to get started using the file uploader above.")
+
+ # Show example images in a grid
+ st.subheader("Example Documents")
- # Show redirect message if a sample was just loaded
- if st.session_state.get('redirect_to_processing', False):
- st.success("**Sample document loaded!** Please switch to the **Document Processing** tab to view and process it.")
- # Clear the flag after showing the message
- st.session_state.redirect_to_processing = False
+ # Add a sample images container
+ with st.container():
+ # Find sample images from the input directory to display
+ input_dir = Path(__file__).parent / "input"
+ sample_images = []
+ if input_dir.exists():
+ # Find valid jpg files (with size > 50KB to avoid placeholders)
+ sample_images = [
+ path for path in input_dir.glob("*.jpg")
+ if path.stat().st_size > 50000
+ ][:3] # Limit to 3 samples
- show_example_documents()
-
- # Previous results tab temporarily removed
-
- with main_tab3:
- # About tab
- display_about_tab()
-
-# Run the application
-if __name__ == "__main__":
- main()
+ if sample_images:
+ columns = st.columns(3)
+ for i, img_path in enumerate(sample_images):
+ with columns[i % 3]:
+ try:
+ st.image(str(img_path), caption=img_path.name, use_container_width=True)
+ except Exception as e:
+ st.error(f"Error loading image {img_path.name}: {str(e)}")
\ No newline at end of file
diff --git a/backup/app.py b/backup/app.py
new file mode 100644
index 0000000000000000000000000000000000000000..d895b2e927b0769b27073abf1adb99e4261a7795
--- /dev/null
+++ b/backup/app.py
@@ -0,0 +1,535 @@
+import os
+import streamlit as st
+import json
+import sys
+import time
+from pathlib import Path
+import tempfile
+import io
+from pdf2image import convert_from_bytes
+from PIL import Image, ImageEnhance, ImageFilter
+import cv2
+import numpy as np
+
+# Import the StructuredOCR class and config from the local files
+from structured_ocr import StructuredOCR
+from config import MISTRAL_API_KEY
+
+# Set page configuration
+st.set_page_config(
+ page_title="Historical OCR",
+ page_icon="🚀",
+ layout="wide",
+ initial_sidebar_state="expanded"
+)
+
+# Enable caching for expensive operations
+@st.cache_data(ttl=3600, show_spinner=False)
+def convert_pdf_to_images(pdf_bytes, dpi=150):
+    """Render the pages of a PDF, supplied as raw bytes, to images.
+
+    The call is wrapped in Streamlit's data cache with a one-hour TTL.
+
+    Args:
+        pdf_bytes: Raw content of the PDF document.
+        dpi: Rendering resolution forwarded to ``convert_from_bytes``.
+
+    Returns:
+        Whatever ``convert_from_bytes`` produces, or an empty list when the
+        conversion raises (the failure is reported through ``st.error``).
+    """
+    try:
+        pages = convert_from_bytes(pdf_bytes, dpi=dpi)
+    except Exception as err:
+        # NOTE(review): the empty fallback is an ordinary return value, so it
+        # is presumably cached like a success -- confirm that is intended.
+        st.error(f"Error converting PDF: {str(err)}")
+        pages = []
+    return pages
+
+def _load_rgb_image(image_bytes):
+    """Decode image bytes into a 3-channel RGB PIL image, flattening alpha onto white."""
+    image = Image.open(io.BytesIO(image_bytes))
+    has_alpha = image.mode in ("RGBA", "LA") or (
+        image.mode == "P" and "transparency" in image.info
+    )
+    if has_alpha:
+        # Composite on a white page so transparent regions do not turn into
+        # arbitrary (often black) pixels once the alpha channel is dropped.
+        rgba = image.convert("RGBA")
+        flattened = Image.new("RGB", rgba.size, (255, 255, 255))
+        flattened.paste(rgba, mask=rgba.split()[3])
+        return flattened
+    if image.mode != "RGB":
+        return image.convert("RGB")
+    return image
+
+
+@st.cache_data(ttl=3600, show_spinner=False)
+def preprocess_image(image_bytes, preprocessing_options):
+    """Apply the selected preprocessing steps to an image and return PNG bytes.
+
+    The input is normalised to 3-channel RGB before any step runs, because the
+    OpenCV calls below (``COLOR_RGB2GRAY``, ``fastNlMeansDenoisingColored``)
+    only accept 3-channel input; RGBA, palette and grayscale uploads are
+    therefore converted first instead of raising.
+
+    Args:
+        image_bytes: Encoded image content (any format Pillow can open).
+        preprocessing_options: Mapping read with ``.get``; recognised keys are
+            ``"grayscale"``, ``"denoise"``, ``"threshold"`` (truthy enables the
+            step) and ``"contrast"`` (number; 0 means unchanged, each unit
+            changes the contrast factor by 0.1).
+
+    Returns:
+        The processed image encoded as PNG bytes.
+    """
+    img_array = np.array(_load_rgb_image(image_bytes))
+
+    if preprocessing_options.get("grayscale", False):
+        # Round-trip through gray so later steps still see three channels.
+        gray = cv2.cvtColor(img_array, cv2.COLOR_RGB2GRAY)
+        img_array = cv2.cvtColor(gray, cv2.COLOR_GRAY2RGB)
+
+    contrast = preprocessing_options.get("contrast", 0)
+    if contrast != 0:
+        contrast_factor = 1 + (contrast / 10)
+        enhancer = ImageEnhance.Contrast(Image.fromarray(img_array))
+        img_array = np.array(enhancer.enhance(contrast_factor))
+
+    if preprocessing_options.get("denoise", False):
+        img_array = cv2.fastNlMeansDenoisingColored(img_array, None, 10, 10, 7, 21)
+
+    if preprocessing_options.get("threshold", False):
+        gray = cv2.cvtColor(img_array, cv2.COLOR_RGB2GRAY)
+        binary = cv2.adaptiveThreshold(gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
+                                       cv2.THRESH_BINARY, 11, 2)
+        img_array = cv2.cvtColor(binary, cv2.COLOR_GRAY2RGB)
+
+    # Encode the result as PNG and hand back the raw bytes.
+    byte_io = io.BytesIO()
+    Image.fromarray(img_array).save(byte_io, format='PNG')
+    return byte_io.getvalue()
+
+# Define functions
+def process_file(uploaded_file, use_vision=True, preprocessing_options=None):
+    """Run OCR on an uploaded file and return the structured result.
+
+    The upload is written to a temporary file, optionally preprocessed (images
+    only), size-checked against the 50 MB limit and passed to
+    ``StructuredOCR.process_file``. Every temporary file created along the way
+    is removed before returning, whether processing succeeds or fails.
+
+    Args:
+        uploaded_file: Object exposing ``name`` and ``getvalue()`` (as returned
+            by ``st.file_uploader``).
+        use_vision: Forwarded to the OCR processor.
+        preprocessing_options: Mapping of preprocessing settings; preprocessing
+            runs only for image files and only if at least one value is truthy.
+
+    Returns:
+        The processor's result dict, a placeholder dict when no API key is
+        configured, or an error dict when the file exceeds 50 MB.
+
+    Raises:
+        Exception: Any error raised while processing is reported with
+            ``st.error`` and then re-raised unchanged.
+    """
+    if preprocessing_options is None:
+        preprocessing_options = {}
+
+    # Show progress indicator
+    progress_bar = st.progress(0)
+    status_text = st.empty()
+    status_text.text("Preparing file for processing...")
+
+    # Every temp file is recorded here so the cleanup below removes all of
+    # them, not just the one `temp_path` happens to point at last.
+    temp_paths = []
+
+    try:
+        # Save the uploaded file to a temporary file
+        upload_suffix = Path(uploaded_file.name).suffix
+        with tempfile.NamedTemporaryFile(delete=False, suffix=upload_suffix) as tmp:
+            temp_paths.append(tmp.name)
+            tmp.write(uploaded_file.getvalue())
+            temp_path = tmp.name
+
+        # Check if API key is available
+        if not MISTRAL_API_KEY:
+            # Return dummy data if no API key
+            progress_bar.progress(100)
+            status_text.empty()
+            return {
+                "file_name": uploaded_file.name,
+                "topics": ["Sample Document"],
+                "languages": ["English"],
+                "ocr_contents": {
+                    "title": "Sample Document",
+                    "content": "This is sample content. To process real documents, please set the MISTRAL_API_KEY environment variable."
+                }
+            }
+
+        # Update progress
+        progress_bar.progress(20)
+        status_text.text("Initializing OCR processor...")
+
+        # Initialize OCR processor
+        processor = StructuredOCR()
+
+        # Determine file type from extension
+        file_type = "pdf" if upload_suffix.lower() == ".pdf" else "image"
+
+        # Apply preprocessing if needed
+        if any(preprocessing_options.values()) and file_type == "image":
+            status_text.text("Applying image preprocessing...")
+            processed_bytes = preprocess_image(uploaded_file.getvalue(), preprocessing_options)
+
+            # preprocess_image always emits PNG data, so name the file to match
+            # instead of reusing the upload's extension.
+            with tempfile.NamedTemporaryFile(delete=False, suffix=".png") as proc_tmp:
+                temp_paths.append(proc_tmp.name)
+                proc_tmp.write(processed_bytes)
+                temp_path = proc_tmp.name
+
+        # Get file size in MB
+        file_size_mb = os.path.getsize(temp_path) / (1024 * 1024)
+
+        # Check if file exceeds API limits (50 MB)
+        if file_size_mb > 50:
+            # Finish the progress widgets like every other exit path does.
+            progress_bar.progress(100)
+            status_text.empty()
+            st.error(f"File too large ({file_size_mb:.1f} MB). Maximum file size allowed by Mistral API is 50MB.")
+            return {
+                "file_name": uploaded_file.name,
+                "topics": ["Document"],
+                "languages": ["English"],
+                "confidence_score": 0.0,
+                "error": f"File size {file_size_mb:.2f} MB exceeds Mistral API limit of 50 MB",
+                "ocr_contents": {
+                    "error": f"Failed to process file: File size {file_size_mb:.2f} MB exceeds Mistral API limit of 50 MB",
+                    "partial_text": "Document could not be processed due to size limitations."
+                }
+            }
+
+        # Update progress
+        progress_bar.progress(40)
+        status_text.text("Processing document with OCR...")
+
+        # Process the file with file size information for automatic page limiting
+        # See https://docs.mistral.ai/capabilities/document/ for more info
+        result = processor.process_file(temp_path, file_type=file_type, use_vision=use_vision, file_size_mb=file_size_mb)
+
+        # Complete progress
+        progress_bar.progress(100)
+        status_text.empty()
+
+        return result
+    except Exception as e:
+        progress_bar.progress(100)
+        status_text.empty()
+        st.error(f"Error during processing: {str(e)}")
+        raise
+    finally:
+        # Clean up every temporary file created above
+        for path in temp_paths:
+            try:
+                os.unlink(path)
+            except FileNotFoundError:
+                pass
+
+# App title and description
+st.title("Historical Document OCR")
+st.subheader("Powered by Mistral AI")
+
+# Create main layout with tabs and columns
+main_tab1, main_tab2 = st.tabs(["Document Processing", "About"])
+
+with main_tab1:
+ # Create a two-column layout for file upload and preview
+ upload_col, preview_col = st.columns([1, 1])
+
+ # File uploader in the left column
+ with upload_col:
+ st.markdown("""
+ Upload an image or PDF file to get started.
+
+ Using the latest `mistral-ocr-latest` model for advanced document understanding.
+ """)
+ uploaded_file = st.file_uploader("Choose a file", type=["pdf", "png", "jpg", "jpeg"])
+
+# Sidebar: model, image-preprocessing and PDF settings
+with st.sidebar:
+    st.header("Options")
+
+    # Model selection
+    st.subheader("Model Settings")
+    use_vision = st.checkbox(
+        "Use Vision Model",
+        value=True,
+        help="For image files, use the vision model for improved analysis (may be slower)",
+    )
+
+    # Image preprocessing controls, tucked into a collapsible expander
+    st.subheader("Image Preprocessing")
+    with st.expander("Preprocessing Options"):
+        # Dict entries are evaluated top to bottom, so the widgets are created
+        # in the same order as with one assignment per key.
+        preprocessing_options = {
+            "grayscale": st.checkbox(
+                "Convert to Grayscale",
+                help="Convert image to grayscale before OCR",
+            ),
+            "threshold": st.checkbox(
+                "Apply Thresholding",
+                help="Apply adaptive thresholding to enhance text",
+            ),
+            "denoise": st.checkbox(
+                "Denoise Image",
+                help="Remove noise from the image",
+            ),
+            "contrast": st.slider(
+                "Adjust Contrast", -5, 5, 0,
+                help="Adjust image contrast (-5 to +5)",
+            ),
+        }
+
+    # PDF rendering controls, also collapsible
+    st.subheader("PDF Options")
+    with st.expander("PDF Settings"):
+        pdf_dpi = st.slider(
+            "PDF Resolution (DPI)", 72, 300, 150,
+            help="Higher DPI gives better quality but slower processing",
+        )
+        max_pages = st.number_input(
+            "Maximum Pages to Process", 1, 20, 5,
+            help="Limit number of pages to process",
+        )
+
+# About tab content
+# Static help text, rendered as Markdown inside the second tab ("About")
+with main_tab2:
+ st.markdown("""
+ ### About This Application
+
+ This app uses [Mistral AI's Document OCR](https://docs.mistral.ai/capabilities/document/) to extract text and images from historical documents.
+
+ It can process:
+ - Image files (jpg, png, etc.)
+ - PDF documents (multi-page support)
+
+ The extracted content is processed into structured data based on the document type, combining:
+ - Text extraction with `mistral-ocr-latest`
+ - Analysis with language models
+ - Layout preservation with images
+
+ View results in three formats:
+ - Structured HTML view
+ - Raw JSON (for developers)
+ - Markdown with images (preserves document layout)
+
+ **New Features:**
+ - Image preprocessing for better OCR quality
+ - PDF resolution and page controls
+ - Progress tracking during processing
+ """)
+
+# Main flow of the first tab: validate the upload, preview it, and run OCR when the
+# "Process Document" button is pressed; with no upload, show sample images instead.
+with main_tab1:
+ if uploaded_file is not None:
+ # Check file size (cap at 50MB)
+ file_size_mb = len(uploaded_file.getvalue()) / (1024 * 1024)
+
+ if file_size_mb > 50:
+ with upload_col:
+ st.error(f"File too large ({file_size_mb:.1f} MB). Maximum file size is 50MB.")
+ st.stop()
+
+ file_ext = Path(uploaded_file.name).suffix.lower()
+
+ # Display document preview in preview column
+ with preview_col:
+ st.subheader("Document Preview")
+ if file_ext == ".pdf":
+ try:
+ # Convert first page of PDF to image for preview
+ pdf_bytes = uploaded_file.getvalue()
+ images = convert_from_bytes(pdf_bytes, first_page=1, last_page=1, dpi=150)
+
+ if images:
+ # Convert PIL image to bytes for Streamlit
+ first_page = images[0]
+ img_bytes = io.BytesIO()
+ first_page.save(img_bytes, format='JPEG')
+ img_bytes.seek(0)
+
+ # Display the PDF preview
+ st.image(img_bytes, caption=f"PDF Preview: {uploaded_file.name}", use_container_width=True)
+ else:
+ st.info(f"PDF uploaded: {uploaded_file.name}")
+ except Exception:
+ # Simply show the file name without an error message
+ st.info(f"PDF uploaded: {uploaded_file.name}")
+ st.info("Click 'Process Document' to analyze the content.")
+ else:
+ st.image(uploaded_file, use_container_width=True)
+
+ # Add image preprocessing preview in a collapsible section if needed
+ if any(preprocessing_options.values()) and uploaded_file.type.startswith('image/'):
+ with st.expander("Image Preprocessing Preview"):
+ preview_cols = st.columns(2)
+
+ with preview_cols[0]:
+ st.markdown("**Original Image**")
+ st.image(uploaded_file, use_container_width=True)
+
+ with preview_cols[1]:
+ st.markdown("**Preprocessed Image**")
+ try:
+ processed_bytes = preprocess_image(uploaded_file.getvalue(), preprocessing_options)
+ st.image(io.BytesIO(processed_bytes), use_container_width=True)
+ except Exception as e:
+ st.error(f"Error in preprocessing: {str(e)}")
+
+ # Process button - flush left with similar padding as file browser
+ with upload_col:
+ process_button = st.button("Process Document", use_container_width=True)
+
+ # Results section
+ if process_button:
+ try:
+ # Get max_pages or default if not available
+ # NOTE(review): max_pages_value is assigned here but not read again in this block, and
+ # the process_file call below does not receive it - confirm whether the sidebar page
+ # limit is meant to be passed through.
+ max_pages_value = max_pages if 'max_pages' in locals() else None
+
+ # Call process_file with all options
+ result = process_file(uploaded_file, use_vision, preprocessing_options)
+
+ # Create results tabs for better organization
+ results_tab1, results_tab2 = st.tabs(["Document Analysis", "Technical Details"])
+
+ with results_tab1:
+ # Create two columns for metadata and content
+ meta_col, content_col = st.columns([1, 2])
+
+ with meta_col:
+ st.subheader("Document Metadata")
+ st.success("**Document processed successfully**")
+
+ # Display file info
+ st.write(f"**File Name:** {result.get('file_name', uploaded_file.name)}")
+
+ # Display info if only limited pages were processed
+ if 'limited_pages' in result:
+ st.info(f"Processed {result['limited_pages']['processed']} of {result['limited_pages']['total']} pages")
+
+ # Display languages if available
+ if 'languages' in result:
+ languages = [lang for lang in result['languages'] if lang is not None]
+ if languages:
+ st.write(f"**Languages:** {', '.join(languages)}")
+
+ # Confidence score if available
+ if 'confidence_score' in result:
+ confidence = result['confidence_score']
+ st.write(f"**OCR Confidence:** {confidence:.1%}")
+
+ # Display topics if available
+ if 'topics' in result and result['topics']:
+ st.write(f"**Topics:** {', '.join(result['topics'])}")
+
+ with content_col:
+ st.subheader("Document Contents")
+ if 'ocr_contents' in result:
+ # Check if there are images in the OCR result
+ has_images = False
+ if 'raw_response' in result:
+ try:
+ has_images = any(page.images for page in result['raw_response'].pages)
+ except Exception:
+ has_images = False
+
+ # Create tabs for different views
+ if has_images:
+ view_tab1, view_tab2, view_tab3 = st.tabs(["Structured View", "Raw JSON", "With Images"])
+ else:
+ view_tab1, view_tab2 = st.tabs(["Structured View", "Raw JSON"])
+
+ with view_tab1:
+ # Display in a more user-friendly format based on the content structure
+ html_content = ""
+ if isinstance(result['ocr_contents'], dict):
+ for section, content in result['ocr_contents'].items():
+ if content: # Only display non-empty sections
+ # NOTE(review): the section_title literal below is split across lines and
+ # html_dict is appended to without a visible initialisation - HTML markup
+ # (and possibly whole branches) appear to be missing from this listing;
+ # verify against the original file. As shown, content.items() also assumes
+ # every non-empty section is a dict - confirm string/list sections are handled.
+ section_title = f"
"
+ st.markdown(f"#### {section.replace('_', ' ').title()}")
+ for k, v in content.items():
+ html_dict += f"
{k}
{v}
"
+ st.markdown(f"**{k}:** {v}")
+ html_dict += "
"
+ html_content += html_dict
+
+ # Add download button in a smaller section
+ with st.expander("Export Content"):
+ # Alternative download button
+ html_bytes = html_content.encode()
+ st.download_button(
+ label="Download as HTML",
+ data=html_bytes,
+ file_name="document_content.html",
+ mime="text/html"
+ )
+
+ with view_tab2:
+ # Show the raw JSON for developers
+ st.json(result)
+
+ if has_images:
+ with view_tab3:
+ # Show loading indicator while preparing images
+ with st.spinner("Preparing document with embedded images..."):
+ try:
+ # Import function
+ try:
+ from ocr_utils import get_combined_markdown
+ except ImportError:
+ st.error("Required module ocr_utils not found.")
+ st.stop()
+
+ # Check if raw_response is available
+ if 'raw_response' not in result:
+ st.warning("Raw OCR response not available. Cannot display images.")
+ st.stop()
+
+ # Validate the raw_response structure before processing
+ if not hasattr(result['raw_response'], 'pages'):
+ st.warning("Invalid OCR response format. Cannot display images.")
+ st.stop()
+
+ # Get the combined markdown with images
+ combined_markdown = get_combined_markdown(result['raw_response'])
+
+ if not combined_markdown or combined_markdown.strip() == "":
+ st.warning("No image content found in the document.")
+ st.stop()
+
+ # NOTE(review): the CSS block, the wrapper markup and the HTML download
+ # template below look empty in this listing - confirm against the original file.
+ # Add CSS to ensure proper spacing and handling of text and images
+ st.markdown("""
+
+ """, unsafe_allow_html=True)
+
+ # Wrap the markdown in a div with the class for styling
+ st.markdown(f"""
+
+ {combined_markdown}
+
+ """, unsafe_allow_html=True)
+
+ # Add a download button for the combined content
+ st.download_button(
+ label="Download with Images (HTML)",
+ data=f"""
+
+
+
+
+
+ {combined_markdown}
+
+
+ """,
+ file_name="document_with_images.html",
+ mime="text/html"
+ )
+
+ except Exception as e:
+ st.error(f"Could not display document with images: {str(e)}")
+ st.info("Try refreshing or processing the document again.")
+ else:
+ st.error("No OCR content was extracted from the document.")
+
+ with results_tab2:
+ st.subheader("Raw Processing Results")
+ st.json(result)
+
+ except Exception as e:
+ st.error(f"Error processing document: {str(e)}")
+ else:
+ # Display sample images in the main area when no file is uploaded
+ st.info("Upload a document to get started using the file uploader above.")
+
+ # Show example images in a grid
+ st.subheader("Example Documents")
+
+ # Add a sample images container
+ with st.container():
+ # Find sample images from the input directory to display
+ input_dir = Path(__file__).parent / "input"
+ sample_images = []
+ if input_dir.exists():
+ sample_images = list(input_dir.glob("*.jpg"))[:3] # Limit to 3 samples
+
+ if sample_images:
+ columns = st.columns(3)
+ for i, img_path in enumerate(sample_images):
+ with columns[i % 3]:
+ st.image(str(img_path), caption=img_path.name, use_container_width=True)
\ No newline at end of file
diff --git a/backup/config.py b/backup/config.py
new file mode 100644
index 0000000000000000000000000000000000000000..f999828a8664fad82e03b043f59bfff44e0b0b06
--- /dev/null
+++ b/backup/config.py
@@ -0,0 +1,17 @@
+# config.py
+"""
+Configuration file for Mistral OCR processing.
+Contains API key and other settings.
+"""
+import os
+
+# Mistral API key, read from the environment only (no default key is shipped
+# for security reasons).
+# Lookup order: HF_MISTRAL_API_KEY, then MISTRAL_API_KEY, then an empty string.
+# `or` is used instead of a nested .get() default so that an empty or
+# whitespace-only HF_MISTRAL_API_KEY falls through to MISTRAL_API_KEY
+# rather than masking it.
+MISTRAL_API_KEY = (
+    os.environ.get("HF_MISTRAL_API_KEY", "").strip()
+    or os.environ.get("MISTRAL_API_KEY", "").strip()
+)
+
+# Model settings
+OCR_MODEL = "mistral-ocr-latest"
+TEXT_MODEL = "ministral-8b-latest"
+VISION_MODEL = "pixtral-12b-latest"
\ No newline at end of file
diff --git a/input/magician-or-bottle-cungerer.jpg b/backup/input/The Magician, or Bottle Cungerer.jpeg
similarity index 100%
rename from input/magician-or-bottle-cungerer.jpg
rename to backup/input/The Magician, or Bottle Cungerer.jpeg
diff --git a/input/baldwin-15th-north.jpg b/backup/input/baldwin-letter-1.jpg
similarity index 100%
rename from input/baldwin-15th-north.jpg
rename to backup/input/baldwin-letter-1.jpg
diff --git a/backup/input/baldwin-letter-2.jpg b/backup/input/baldwin-letter-2.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..b19ec5985abd7d36aef556b7427d2524c54d5d13
--- /dev/null
+++ b/backup/input/baldwin-letter-2.jpg
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8b605a6eabd466da265e9e1aa3576160c4dbee06643ece5a18cdb1e45f3f683a
+size 114136
diff --git a/backup/input/flier.png b/backup/input/flier.png
new file mode 100644
index 0000000000000000000000000000000000000000..a02e7743490614e3f9884a88aa9ad15214609a34
Binary files /dev/null and b/backup/input/flier.png differ
diff --git a/backup/input/letter-1.jpg b/backup/input/letter-1.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..4e6b833fc8ab797f608dfe7c4e92642ca8b773d3
--- /dev/null
+++ b/backup/input/letter-1.jpg
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a30d7d9f224c777a1697507200a87e41be5fd590efbe8271fa41dbd8bd8a158d
+size 135244
diff --git a/backup/input/letter-2.jpg b/backup/input/letter-2.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..b19ec5985abd7d36aef556b7427d2524c54d5d13
--- /dev/null
+++ b/backup/input/letter-2.jpg
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8b605a6eabd466da265e9e1aa3576160c4dbee06643ece5a18cdb1e45f3f683a
+size 114136
diff --git a/backup/input/letter-3.jpg b/backup/input/letter-3.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..fa0cc25dd4affd353900e34e9e986d6fa435ee8e
--- /dev/null
+++ b/backup/input/letter-3.jpg
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7fe2d81bb4e8bef7cdbf87c58a8cc180c49c313e5099de167ae37bbbfb895e88
+size 230837
diff --git a/backup/input/magellan-travels.jpg b/backup/input/magellan-travels.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..82e136cd1601c2c3acad8c7599339f84f335f469
--- /dev/null
+++ b/backup/input/magellan-travels.jpg
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ae3e860789e2c3c8032499e5326864294dbc1b01059169fd08203c980577010b
+size 283156
diff --git a/backup/input/menu.pdf b/backup/input/menu.pdf
new file mode 100644
index 0000000000000000000000000000000000000000..276af61cf2f4ac361eaa86b3c430c527b1e59230
--- /dev/null
+++ b/backup/input/menu.pdf
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:42d96008f374f5be8046b569c868e33f4e5a0e5e166c245d324b44140c7e6c2e
+size 2554815
diff --git a/backup/input/recipe.jpg b/backup/input/recipe.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..1701f0ba9840b4b978a4bae4d14c2780516c5f26
Binary files /dev/null and b/backup/input/recipe.jpg differ
diff --git a/backup/ocr_utils.py b/backup/ocr_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..5c8a006402aa612fa1929a60c3754c5d8ee69c43
--- /dev/null
+++ b/backup/ocr_utils.py
@@ -0,0 +1,136 @@
+"""
+Utility functions for OCR processing with Mistral AI.
+Contains helper functions for working with OCR responses and image handling.
+"""
+
+import json
+import base64
+from pathlib import Path
+from typing import Dict, List, Optional, Union
+
+from mistralai import DocumentURLChunk, ImageURLChunk, TextChunk
+
+def replace_images_in_markdown(markdown_str: str, images_dict: dict) -> str:
+    """
+    Replace image placeholders in markdown with base64-encoded images.
+
+    An image is referenced in the OCR markdown as ``![<id>](<id>)``; the link
+    target is swapped for the image's base64 payload so it renders inline.
+
+    Args:
+        markdown_str: Markdown text containing image placeholders
+        images_dict: Dictionary mapping image IDs to base64 strings
+
+    Returns:
+        Markdown text with images replaced by base64 data
+    """
+    for img_name, base64_str in images_dict.items():
+        if not base64_str:
+            # No payload for this image: keep the placeholder rather than
+            # emitting a broken "![id](None)" link
+            continue
+        # NOTE(review): assumes the payload is already a complete data URI as
+        # delivered by the OCR API - confirm before prepending any prefix.
+        markdown_str = markdown_str.replace(
+            f"![{img_name}]({img_name})", f"![{img_name}]({base64_str})"
+        )
+    return markdown_str
+
+def get_combined_markdown(ocr_response) -> str:
+    """
+    Build one markdown document from every page of an OCR response.
+
+    For each page the image placeholders are filled in, then every newline is
+    doubled so paragraphs and images are spaced apart when rendered. Pages are
+    joined with a horizontal-rule separator.
+
+    Args:
+        ocr_response: Response from OCR processing containing text and images
+            See https://docs.mistral.ai/capabilities/document/ for API reference
+
+    Returns:
+        Combined markdown string with embedded images
+    """
+    rendered_pages: list[str] = []
+    for page in ocr_response.pages:
+        # Map each image id on this page to its base64 payload
+        images_by_id = {img.id: img.image_base64 for img in page.images}
+
+        with_images = replace_images_in_markdown(page.markdown, images_by_id)
+
+        # Double every newline to improve spacing in the rendered output
+        rendered_pages.append(with_images.replace("\n", "\n\n"))
+
+    # A markdown rule marks each page boundary
+    return "\n\n---\n\n".join(rendered_pages)
+
+def encode_image_for_api(image_path: Union[str, Path]) -> str:
+    """
+    Encode an image as a base64 data URL for API use.
+
+    The MIME type in the URL is derived from the file extension (so a PNG is
+    labelled ``image/png``); unknown or non-image extensions fall back to
+    ``image/jpeg``.
+
+    Args:
+        image_path: Path to the image file
+
+    Returns:
+        Base64 data URL for the image
+
+    Raises:
+        FileNotFoundError: If image_path does not point to an existing file
+    """
+    import mimetypes  # local import keeps this function self-contained
+
+    # Path() accepts both str and Path inputs
+    image_file = Path(image_path)
+
+    # Verify image exists
+    if not image_file.is_file():
+        raise FileNotFoundError(f"Image file not found: {image_file}")
+
+    mime_type, _ = mimetypes.guess_type(image_file.name)
+    if not mime_type or not mime_type.startswith("image/"):
+        mime_type = "image/jpeg"
+
+    # Encode image as base64
+    encoded = base64.b64encode(image_file.read_bytes()).decode()
+    return f"data:{mime_type};base64,{encoded}"
+
+def process_image_with_ocr(client, image_path: Union[str, Path], model: str = "mistral-ocr-latest"):
+    """
+    Run OCR on a single image file and hand back the API response.
+
+    Args:
+        client: Mistral AI client
+        image_path: Path to the image file
+        model: OCR model to use
+
+    Returns:
+        OCR response object
+    """
+    # The image travels to the API as a base64 data URL wrapped in an image chunk
+    image_chunk = ImageURLChunk(image_url=encode_image_for_api(image_path))
+    return client.ocr.process(document=image_chunk, model=model)
+
+def ocr_response_to_json(ocr_response, indent: int = 4) -> str:
+    """
+    Render an OCR response as pretty-printed JSON.
+
+    Args:
+        ocr_response: OCR response object
+        indent: Indentation level for JSON formatting
+
+    Returns:
+        Formatted JSON string
+    """
+    # Round-trip through the response's own JSON dump, then re-serialize with
+    # the requested indentation
+    raw_json = ocr_response.model_dump_json()
+    return json.dumps(json.loads(raw_json), indent=indent)
+
+# For display in notebooks
+try:
+    from IPython.display import Markdown, display
+except ImportError:
+    # IPython not available: the notebook helper is simply not defined
+    pass
+else:
+    def display_ocr_with_images(ocr_response):
+        """
+        Render an OCR response, images included, in an IPython environment.
+
+        Args:
+            ocr_response: OCR response object
+        """
+        display(Markdown(get_combined_markdown(ocr_response)))
\ No newline at end of file
diff --git a/backup/pdf_ocr.py b/backup/pdf_ocr.py
new file mode 100644
index 0000000000000000000000000000000000000000..44ac2dec6ee5da63f03e3a7264a0c44125617acb
--- /dev/null
+++ b/backup/pdf_ocr.py
@@ -0,0 +1,76 @@
+#!/usr/bin/env python3
+"""
+PDFOCR - Module for processing PDF files with OCR and extracting structured data.
+"""
+
+import json
+from pathlib import Path
+from structured_ocr import StructuredOCR
+
+class PDFOCR:
+    """Class for processing PDF files with OCR and extracting structured data."""
+
+    def __init__(self, api_key=None):
+        """Initialize the PDF OCR processor."""
+        self.processor = StructuredOCR(api_key=api_key)
+
+    def process_pdf(self, pdf_path, use_vision=True):
+        """
+        Process a PDF file with OCR and extract structured data.
+
+        Args:
+            pdf_path: Path to the PDF file
+            use_vision: Whether to use vision model for improved analysis
+
+        Returns:
+            Dictionary with structured OCR results
+
+        Raises:
+            FileNotFoundError: If pdf_path does not exist
+        """
+        pdf_path = Path(pdf_path)
+        if not pdf_path.exists():
+            raise FileNotFoundError(f"PDF file not found: {pdf_path}")
+
+        return self.processor.process_file(pdf_path, file_type="pdf", use_vision=use_vision)
+
+    @staticmethod
+    def _json_default(obj):
+        """Convert objects json cannot encode itself (e.g. a raw OCR response)."""
+        dump = getattr(obj, "model_dump_json", None)
+        if callable(dump):
+            # Objects exposing model_dump_json() are serialized through it
+            return json.loads(dump())
+        raise TypeError(f"Object of type {type(obj).__name__} is not JSON serializable")
+
+    def save_json_output(self, pdf_path, output_path, use_vision=True):
+        """
+        Process a PDF file and save the structured output as JSON.
+
+        The result dictionary may carry non-JSON values (such as a raw OCR
+        response object); those are converted via their ``model_dump_json()``
+        so that writing the file does not fail.
+
+        Args:
+            pdf_path: Path to the PDF file
+            output_path: Path where to save the JSON output
+            use_vision: Whether to use vision model for improved analysis
+
+        Returns:
+            Path to the saved JSON file
+        """
+        # Process the PDF
+        result = self.process_pdf(pdf_path, use_vision=use_vision)
+
+        # Save the result to JSON, creating the target directory if needed
+        output_path = Path(output_path)
+        output_path.parent.mkdir(parents=True, exist_ok=True)
+
+        with open(output_path, 'w', encoding="utf-8") as f:
+            json.dump(result, f, indent=2, default=self._json_default)
+
+        return output_path
+
+# For testing directly
+if __name__ == "__main__":
+    import sys
+
+    if len(sys.argv) < 2:
+        print("Usage: python pdf_ocr.py <pdf_path> [output_path]")
+        sys.exit(1)
+
+    pdf_path = sys.argv[1]
+    output_path = sys.argv[2] if len(sys.argv) > 2 else None
+
+    processor = PDFOCR()
+
+    if output_path:
+        result_path = processor.save_json_output(pdf_path, output_path)
+        print(f"Results saved to: {result_path}")
+    else:
+        result = processor.process_pdf(pdf_path)
+        # A 'raw_response' entry holds the OCR response object, which json
+        # cannot serialize; print everything else.
+        printable = {key: value for key, value in result.items() if key != "raw_response"}
+        print(json.dumps(printable, indent=2))
\ No newline at end of file
diff --git a/backup/requirements.txt b/backup/requirements.txt
new file mode 100644
index 0000000000000000000000000000000000000000..cd8bfec2f8a8e2662446f6ae690ed53227bf7b47
--- /dev/null
+++ b/backup/requirements.txt
@@ -0,0 +1,10 @@
+streamlit>=1.43.2
+mistralai>=0.0.7
+pydantic>=2.0.0
+pycountry>=23.12.11
+pillow>=10.0.0
+python-multipart>=0.0.6
+pdf2image>=1.17.0
+pytesseract>=0.3.10
+opencv-python-headless>=4.6.0
+numpy>=1.23.5
\ No newline at end of file
diff --git a/backup/structured_ocr.py b/backup/structured_ocr.py
new file mode 100644
index 0000000000000000000000000000000000000000..ff80ba6010cd863225a93486561ee5dd12742c51
--- /dev/null
+++ b/backup/structured_ocr.py
@@ -0,0 +1,414 @@
+import os
+import sys
+import time
+from enum import Enum
+from pathlib import Path
+import json
+import base64
+import pycountry
+import logging
+from pydantic import BaseModel
+from mistralai import Mistral
+from mistralai import DocumentURLChunk, ImageURLChunk, TextChunk
+
+# Configure logging
+logging.basicConfig(level=logging.INFO,
+ format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
+
+# Import utilities for OCR processing
+try:
+    from ocr_utils import replace_images_in_markdown, get_combined_markdown
+except ImportError:
+    # Define fallback functions if module not found
+    def replace_images_in_markdown(markdown_str, images_dict):
+        """Swap each ``![id](id)`` placeholder for the image's base64 payload."""
+        for img_name, base64_str in images_dict.items():
+            if not base64_str:
+                # No payload available: leave the placeholder untouched
+                continue
+            markdown_str = markdown_str.replace(
+                f"![{img_name}]({img_name})", f"![{img_name}]({base64_str})"
+            )
+        return markdown_str
+
+    def get_combined_markdown(ocr_response):
+        """Join the markdown of all pages, with images embedded, using blank lines."""
+        markdowns = []
+        for page in ocr_response.pages:
+            image_data = {}
+            for img in page.images:
+                image_data[img.id] = img.image_base64
+            markdowns.append(replace_images_in_markdown(page.markdown, image_data))
+        return "\n\n".join(markdowns)
+
+# Import config directly (now local to historical-ocr)
+from config import MISTRAL_API_KEY, OCR_MODEL, TEXT_MODEL, VISION_MODEL
+
+# Create language enum for structured output
+languages = {lang.alpha_2: lang.name for lang in pycountry.languages if hasattr(lang, 'alpha_2')}
+
+# Metaclass derived from Enum's own metaclass. While a class using it is being
+# created, it adds one entry per language from the module-level `languages`
+# mapping to the class body, so the enum members do not have to be listed by hand.
+class LanguageMeta(Enum.__class__):
+ def __new__(metacls, cls, bases, classdict):
+ # Entry name: language name upper-cased with spaces turned into underscores;
+ # entry value: the language name itself (the alpha-2 code is not used here)
+ for code, name in languages.items():
+ classdict[name.upper().replace(' ', '_')] = name
+ return super().__new__(metacls, cls, bases, classdict)
+
+# Enum of language names; the body is empty because LanguageMeta fills in the
+# members from the module-level `languages` mapping.
+class Language(Enum, metaclass=LanguageMeta):
+ pass
+
+# Schema passed as `response_format` to the chat.parse calls in StructuredOCR.
+class StructuredOCRModel(BaseModel):
+ # Name of the processed file
+ file_name: str
+ # Topics extracted from the document
+ topics: list[str]
+ # Languages found in the document, restricted to Language members
+ languages: list[Language]
+ # OCR text organised into a dictionary (layout left to the model)
+ ocr_contents: dict
+
+class StructuredOCR:
+    """OCR a PDF or image with Mistral and convert the text into structured data.
+
+    Each run calls the OCR model (``OCR_MODEL``) and then asks a chat model
+    (``VISION_MODEL`` or ``TEXT_MODEL``) to organise the OCR markdown into the
+    ``StructuredOCRModel`` schema. Processing failures are reported through an
+    ``error`` key in the returned dictionary instead of being raised.
+    """
+
+    # Confidence reported when the OCR response exposes no usable per-page value
+    DEFAULT_CONFIDENCE = 0.85
+    # Files above this size (MB) are rejected before any API call
+    MAX_FILE_SIZE_MB = 50
+    # PDFs above AUTO_LIMIT_SIZE_MB are cut to AUTO_LIMIT_PAGES pages unless the
+    # caller supplied max_pages or custom_pages
+    AUTO_LIMIT_SIZE_MB = 20
+    AUTO_LIMIT_PAGES = 10
+
+    def __init__(self, api_key=None):
+        """Initialize the OCR processor with API key"""
+        self.api_key = api_key or MISTRAL_API_KEY
+        self.client = Mistral(api_key=self.api_key)
+
+    @staticmethod
+    def _error_result(file_name, error, detail, partial_text):
+        """Build the placeholder result returned when a file cannot be processed."""
+        return {
+            "file_name": file_name,
+            "topics": ["Document"],
+            "languages": ["English"],
+            "confidence_score": 0.0,
+            "error": error,
+            "ocr_contents": {
+                "error": detail,
+                "partial_text": partial_text
+            }
+        }
+
+    def _average_confidence(self, pages):
+        """Return the mean numeric ``confidence`` of *pages*, or the default if none exists."""
+        values = [getattr(page, 'confidence', None) for page in (pages or [])]
+        numeric = [v for v in values if isinstance(v, (int, float))]
+        if numeric:
+            return sum(numeric) / len(numeric)
+        return self.DEFAULT_CONFIDENCE
+
+    @staticmethod
+    def _image_data_url(file_path):
+        """Encode the image at *file_path* as a base64 data URL."""
+        import mimetypes  # local import keeps this class self-contained
+
+        # Label the payload with the real image type; fall back to JPEG when
+        # the extension is unknown or not an image type
+        mime_type, _ = mimetypes.guess_type(file_path.name)
+        if not mime_type or not mime_type.startswith("image/"):
+            mime_type = "image/jpeg"
+        encoded_image = base64.b64encode(file_path.read_bytes()).decode()
+        return f"data:{mime_type};base64,{encoded_image}"
+
+    @staticmethod
+    def _parsed_to_dict(chat_response):
+        """Convert the parsed chat response into a plain, JSON-friendly dictionary."""
+        parsed = chat_response.choices[0].message.parsed
+        result = json.loads(parsed.model_dump_json())
+
+        # Ensure languages is a list of strings, not Language enum objects
+        if 'languages' in result:
+            result['languages'] = [str(lang) for lang in result.get('languages', [])]
+        return result
+
+    def process_file(self, file_path, file_type=None, use_vision=True, max_pages=None, file_size_mb=None, custom_pages=None):
+        """Process a file and return structured OCR results
+
+        Args:
+            file_path: Path to the file to process
+            file_type: 'pdf' or 'image' (will be auto-detected if None)
+            use_vision: Whether to use vision model for improved analysis
+            max_pages: Optional limit on number of pages to process
+            file_size_mb: Optional file size in MB (used for automatic page limiting)
+            custom_pages: Optional list of specific page numbers to process
+
+        Returns:
+            Dictionary with structured OCR results. On failure the dictionary
+            carries an ``error`` key and a ``confidence_score`` of 0.0.
+        """
+        # Convert file_path to Path object if it's a string
+        file_path = Path(file_path)
+
+        # Auto-detect file type if not provided
+        if file_type is None:
+            file_type = "pdf" if file_path.suffix.lower() == ".pdf" else "image"
+
+        # Get file size if not provided
+        if file_size_mb is None and file_path.exists():
+            file_size_mb = file_path.stat().st_size / (1024 * 1024)  # Convert bytes to MB
+
+        # Reject files above the size limit before touching the API
+        if file_size_mb and file_size_mb > self.MAX_FILE_SIZE_MB:
+            size_text = f"File size {file_size_mb:.2f} MB"
+            limit_text = f"limit of {self.MAX_FILE_SIZE_MB} MB"
+            logging.warning(f"{size_text} exceeds Mistral API {limit_text}")
+            return self._error_result(
+                file_path.name,
+                f"{size_text} exceeds API {limit_text}",
+                f"Failed to process file: {size_text} exceeds Mistral API {limit_text}",
+                "Document could not be processed due to size limitations.",
+            )
+
+        # For PDF files, limit pages based on file size if no explicit limit is given.
+        # Anything above MAX_FILE_SIZE_MB was rejected above, so only the
+        # medium-size tier can apply here; smaller files process all pages.
+        if (file_type == "pdf" and file_size_mb and max_pages is None
+                and custom_pages is None and file_size_mb > self.AUTO_LIMIT_SIZE_MB):
+            max_pages = self.AUTO_LIMIT_PAGES
+
+        # Start processing timer
+        start_time = time.time()
+
+        # Read and process the file
+        if file_type == "pdf":
+            result = self._process_pdf(file_path, use_vision, max_pages, custom_pages)
+        else:
+            result = self._process_image(file_path, use_vision)
+
+        # Add processing time information
+        result['processing_time'] = time.time() - start_time
+
+        # Add a default confidence score if not present
+        if 'confidence_score' not in result:
+            result['confidence_score'] = self.DEFAULT_CONFIDENCE
+
+        return result
+
+    def _process_pdf(self, file_path, use_vision=True, max_pages=None, custom_pages=None):
+        """Process a PDF file with OCR
+
+        Args:
+            file_path: Path to the PDF file
+            use_vision: Whether to use vision model
+            max_pages: Optional limit on the number of pages to process
+            custom_pages: Optional list of specific page numbers to process
+
+        Returns:
+            Structured result dictionary; an error placeholder if any step fails
+        """
+        logger = logging.getLogger("pdf_processor")
+        logger.info(f"Processing PDF: {file_path}")
+
+        try:
+            # Upload the PDF file
+            logger.info("Uploading PDF file to Mistral API")
+            uploaded_file = self.client.files.upload(
+                file={
+                    "file_name": file_path.stem,
+                    "content": file_path.read_bytes(),
+                },
+                purpose="ocr",
+            )
+
+            # Get a signed URL for the uploaded file
+            signed_url = self.client.files.get_signed_url(file_id=uploaded_file.id, expiry=1)
+
+            # Process the PDF with OCR
+            logger.info(f"Processing PDF with OCR using {OCR_MODEL}")
+            pdf_response = self.client.ocr.process(
+                document=DocumentURLChunk(document_url=signed_url.url),
+                model=OCR_MODEL,
+                include_image_base64=True
+            )
+
+            # Limit pages if requested (the OCR call itself covers the whole file)
+            pages_to_process = pdf_response.pages
+            total_pages = len(pdf_response.pages)
+            limited_pages = False
+
+            logger.info(f"PDF has {total_pages} total pages")
+
+            if custom_pages:
+                # Convert to 0-based indexing and filter valid page numbers
+                valid_indices = [i - 1 for i in custom_pages if 0 < i <= total_pages]
+                if valid_indices:
+                    pages_to_process = [pdf_response.pages[i] for i in valid_indices]
+                    limited_pages = True
+                    logger.info(f"Processing {len(valid_indices)} custom-selected pages")
+            elif max_pages and total_pages > max_pages:
+                pages_to_process = pages_to_process[:max_pages]
+                limited_pages = True
+                logger.info(f"Processing only first {max_pages} pages out of {total_pages} total pages")
+
+            # Average the per-page confidence when the response provides one
+            confidence_score = self._average_confidence(pages_to_process)
+
+            # Combine pages' markdown into a single string
+            all_markdown = "\n\n".join(page.markdown for page in pages_to_process)
+
+            # Extract structured data using the appropriate model
+            if use_vision:
+                # The vision model is given the first image of the first selected page
+                first_page_image = None
+                if pages_to_process and pages_to_process[0].images:
+                    first_page_image = pages_to_process[0].images[0].image_base64
+
+                if first_page_image:
+                    logger.info(f"Using vision model: {VISION_MODEL}")
+                    result = self._extract_structured_data_with_vision(first_page_image, all_markdown, file_path.name)
+                else:
+                    # Fall back to text-only model if no image available
+                    logger.info(f"No images in PDF, falling back to text model: {TEXT_MODEL}")
+                    result = self._extract_structured_data_text_only(all_markdown, file_path.name)
+            else:
+                logger.info(f"Using text-only model: {TEXT_MODEL}")
+                result = self._extract_structured_data_text_only(all_markdown, file_path.name)
+
+            # Add page limit info to result if needed
+            if limited_pages:
+                result['limited_pages'] = {
+                    'processed': len(pages_to_process),
+                    'total': total_pages
+                }
+
+            result['confidence_score'] = confidence_score
+
+            # Store the raw OCR response for image rendering.
+            # This object is not JSON serializable as-is.
+            result['raw_response'] = pdf_response
+
+            logger.info("PDF processing completed successfully")
+            return result
+
+        except Exception as e:
+            # Boundary handler: any failure becomes an error result for the caller
+            logger.error(f"Error processing PDF: {str(e)}")
+            return self._error_result(
+                file_path.name,
+                str(e),
+                f"Failed to process PDF: {str(e)}",
+                "Document could not be fully processed.",
+            )
+
+    def _process_image(self, file_path, use_vision=True):
+        """Process an image file with OCR
+
+        Args:
+            file_path: Path to the image file
+            use_vision: Whether to use vision model
+
+        Returns:
+            Structured result dictionary; an error placeholder if any step fails
+        """
+        logger = logging.getLogger("image_processor")
+        logger.info(f"Processing image: {file_path}")
+
+        try:
+            # Read and encode the image file
+            logger.info("Encoding image for API")
+            base64_data_url = self._image_data_url(file_path)
+
+            # Process the image with OCR
+            logger.info(f"Processing image with OCR using {OCR_MODEL}")
+            image_response = self.client.ocr.process(
+                document=ImageURLChunk(image_url=base64_data_url),
+                model=OCR_MODEL,
+                include_image_base64=True
+            )
+
+            # Get the OCR markdown from the first page
+            image_ocr_markdown = image_response.pages[0].markdown if image_response.pages else ""
+
+            # Confidence of the first page, or the default when unavailable
+            confidence_score = self._average_confidence((image_response.pages or [])[:1])
+
+            # Extract structured data using the appropriate model
+            if use_vision:
+                logger.info(f"Using vision model: {VISION_MODEL}")
+                result = self._extract_structured_data_with_vision(base64_data_url, image_ocr_markdown, file_path.name)
+            else:
+                logger.info(f"Using text-only model: {TEXT_MODEL}")
+                result = self._extract_structured_data_text_only(image_ocr_markdown, file_path.name)
+
+            result['confidence_score'] = confidence_score
+
+            # Store the raw OCR response for image rendering.
+            # This object is not JSON serializable as-is.
+            result['raw_response'] = image_response
+
+            logger.info("Image processing completed successfully")
+            return result
+
+        except Exception as e:
+            # Boundary handler: any failure becomes an error result for the caller
+            logger.error(f"Error processing image: {str(e)}")
+            return self._error_result(
+                file_path.name,
+                str(e),
+                f"Failed to process image: {str(e)}",
+                "Image could not be processed.",
+            )
+
+    def _extract_structured_data_with_vision(self, image_base64, ocr_markdown, filename):
+        """Extract structured data using vision model
+
+        Falls back to the text-only model when the vision call or its parsing fails.
+        """
+        try:
+            # NOTE(review): the lone "." after the markdown in the prompt suggests
+            # delimiter tags were lost from this listing - confirm against the original file.
+            chat_response = self.client.chat.parse(
+                model=VISION_MODEL,
+                messages=[
+                    {
+                        "role": "user",
+                        "content": [
+                            ImageURLChunk(image_url=image_base64),
+                            TextChunk(text=(
+                                f"This is a historical document's OCR in markdown:\n"
+                                f"\n{ocr_markdown}\n.\n"
+                                f"Convert this into a structured JSON response with the OCR contents in a sensible dictionary. "
+                                f"Extract topics, languages, and organize the content logically."
+                            ))
+                        ],
+                    },
+                ],
+                response_format=StructuredOCRModel,
+                temperature=0
+            )
+
+            result = self._parsed_to_dict(chat_response)
+
+        except Exception as e:
+            # Logged instead of printed so stdout stays clean for callers that emit JSON
+            logging.getLogger("structured_ocr").warning(
+                f"Vision model failed: {str(e)}. Falling back to text-only model."
+            )
+            result = self._extract_structured_data_text_only(ocr_markdown, filename)
+
+        return result
+
+    def _extract_structured_data_text_only(self, ocr_markdown, filename):
+        """Extract structured data using text-only model
+
+        Returns a basic result containing the raw OCR text when the call or its parsing fails.
+        """
+        try:
+            chat_response = self.client.chat.parse(
+                model=TEXT_MODEL,
+                messages=[
+                    {
+                        "role": "user",
+                        "content": f"This is a historical document's OCR in markdown:\n"
+                                   f"\n{ocr_markdown}\n.\n"
+                                   f"Convert this into a structured JSON response with the OCR contents. "
+                                   f"Extract topics, languages, and organize the content logically."
+                    },
+                ],
+                response_format=StructuredOCRModel,
+                temperature=0
+            )
+
+            result = self._parsed_to_dict(chat_response)
+
+        except Exception as e:
+            # Logged instead of printed so stdout stays clean for callers that emit JSON
+            logging.getLogger("structured_ocr").warning(
+                f"Text model failed: {str(e)}. Creating basic result."
+            )
+            result = {
+                "file_name": filename,
+                "topics": ["Document"],
+                "languages": ["English"],
+                "ocr_contents": {
+                    "raw_text": ocr_markdown
+                }
+            }
+
+        return result
+
+# For testing directly
+if __name__ == "__main__":
+    import sys
+
+    if len(sys.argv) < 2:
+        print("Usage: python structured_ocr.py <file_path>")
+        sys.exit(1)
+
+    file_path = sys.argv[1]
+    processor = StructuredOCR()
+    result = processor.process_file(file_path)
+
+    # 'raw_response' holds the OCR response object, which json cannot
+    # serialize; print everything else.
+    printable = {key: value for key, value in result.items() if key != 'raw_response'}
+    print(json.dumps(printable, indent=2))
\ No newline at end of file
diff --git a/config.py b/config.py
index a7a295d9a7d955b7232267a5b72b4bb6f0c8dfca..f999828a8664fad82e03b043f59bfff44e0b0b06 100644
--- a/config.py
+++ b/config.py
@@ -4,64 +4,14 @@ Configuration file for Mistral OCR processing.
Contains API key and other settings.
"""
import os
-import logging
-from dotenv import load_dotenv
-# Configure logging
-logger = logging.getLogger("config")
+# Your Mistral API key - get from Hugging Face secrets or environment variable
+# The priority order is: HF_MISTRAL_API_KEY env var > MISTRAL_API_KEY env var > empty string
+# Note: No default API key is provided for security reasons
+MISTRAL_API_KEY = os.environ.get("HF_MISTRAL_API_KEY", # First check HF-specific env var
+ os.environ.get("MISTRAL_API_KEY", "")) # Then check regular env var
-# Load environment variables from .env file if it exists
-load_dotenv()
-
-# Mistral API key handling - prioritizing Hugging Face environment
-# Priority order:
-# 1. HF_API_KEY environment variable (Hugging Face standard)
-# 2. HUGGING_FACE_API_KEY environment variable (alternative name)
-# 3. HF_MISTRAL_API_KEY environment variable (for Hugging Face deployment)
-# 4. MISTRAL_API_KEY environment variable (fallback)
-# 5. Empty string (will show warning in app)
-
-MISTRAL_API_KEY = os.environ.get("HF_API_KEY",
- os.environ.get("HUGGING_FACE_API_KEY",
- os.environ.get("HF_MISTRAL_API_KEY",
- os.environ.get("MISTRAL_API_KEY", "")))).strip()
-
-if not MISTRAL_API_KEY:
- logger.warning("No Mistral API key found in environment variables. API functionality will be limited.")
-
-# Check if we're in test mode (allows operation without valid API key)
-# Set to False to use actual API calls with Mistral API
-TEST_MODE = False
-
-# Model settings with fallbacks
-OCR_MODEL = os.environ.get("MISTRAL_OCR_MODEL", "mistral-ocr-latest")
-TEXT_MODEL = os.environ.get("MISTRAL_TEXT_MODEL", "mistral-small-latest") # Updated from ministral-8b-latest
-VISION_MODEL = os.environ.get("MISTRAL_VISION_MODEL", "mistral-small-latest") # faster model that supports vision
-
-# Image preprocessing settings optimized for historical documents
-# These can be customized from environment variables
-IMAGE_PREPROCESSING = {
- "enhance_contrast": float(os.environ.get("ENHANCE_CONTRAST", "3.5")), # Increased contrast for better text recognition
- "sharpen": os.environ.get("SHARPEN", "True").lower() in ("true", "1", "yes"),
- "denoise": os.environ.get("DENOISE", "True").lower() in ("true", "1", "yes"),
- "max_size_mb": float(os.environ.get("MAX_IMAGE_SIZE_MB", "200.0")), # Increased size limit for better quality
- "target_dpi": int(os.environ.get("TARGET_DPI", "300")), # Target DPI for scaling
- "compression_quality": int(os.environ.get("COMPRESSION_QUALITY", "100")), # Higher quality for better OCR results
- # # Enhanced settings for handwritten documents
- "handwritten": {
- "block_size": int(os.environ.get("HANDWRITTEN_BLOCK_SIZE", "21")), # Larger block size for adaptive thresholding
- "constant": int(os.environ.get("HANDWRITTEN_CONSTANT", "5")), # Lower constant for adaptive thresholding
- "use_dilation": os.environ.get("HANDWRITTEN_DILATION", "True").lower() in ("true", "1", "yes"), # Connect broken strokes
- "dilation_iterations": int(os.environ.get("HANDWRITTEN_DILATION_ITERATIONS", "2")), # More iterations for better stroke connection
- "dilation_kernel_size": int(os.environ.get("HANDWRITTEN_DILATION_KERNEL_SIZE", "3")) # Larger kernel for dilation
- }
-}
-
-# OCR settings optimized for single-page performance
-OCR_SETTINGS = {
- "timeout_ms": int(os.environ.get("OCR_TIMEOUT_MS", "45000")), # Shorter timeout for single pages (45 seconds)
- "max_retries": int(os.environ.get("OCR_MAX_RETRIES", "2")), # Fewer retries to avoid rate-limiting
- "retry_delay": int(os.environ.get("OCR_RETRY_DELAY", "1")), # Shorter initial retry delay for faster execution
- "include_image_base64": os.environ.get("INCLUDE_IMAGE_BASE64", "True").lower() in ("true", "1", "yes"),
- "thread_count": int(os.environ.get("OCR_THREAD_COUNT", "2")) # Lower thread count to prevent API rate limiting
-}
\ No newline at end of file
+# Model settings
+OCR_MODEL = "mistral-ocr-latest"
+TEXT_MODEL = "ministral-8b-latest"
+VISION_MODEL = "pixtral-12b-latest"
\ No newline at end of file
diff --git a/constants.py b/constants.py
deleted file mode 100644
index d21422aa5c91f41ac6711eee6f45e33cbac4f304..0000000000000000000000000000000000000000
--- a/constants.py
+++ /dev/null
@@ -1,193 +0,0 @@
-"""
-Constants for the Historical OCR application.
-
-This module contains all the constants used throughout the application,
-making it easier to maintain and update values in one place.
-"""
-
-# API limits
-MAX_FILE_SIZE_MB = 200
-MAX_PAGES = 20
-
-# Caching
-CACHE_TTL_SECONDS = 24 * 3600 # 24 hours
-MAX_CACHE_ENTRIES = 20
-
-# Image processing
-MAX_IMAGE_DIMENSION = 2500
-IMAGE_QUALITY = 100
-
-# Document types
-DOCUMENT_TYPES = [
- "Auto-detect (standard processing)",
- "Newspaper or Magazine",
- "Letter or Correspondence",
- "Book or Publication",
- "Form or Legal Document",
- "Recipe",
- "Handwritten Document",
- "Map or Illustration",
- "Table or Spreadsheet",
- "Other (specify in instructions)"
-]
-
-# Document layouts
-DOCUMENT_LAYOUTS = [
- "Standard layout",
- "Multiple columns",
- "Table/grid format",
- "Mixed layout with images"
-]
-
-# Preprocessing document types
-PREPROCESSING_DOC_TYPES = ["standard", "handwritten", "typed", "printed"]
-
-# Rotation options
-ROTATION_OPTIONS = [0, 90, 180, 270]
-
-# PDF settings
-DEFAULT_PDF_DPI = 100
-MIN_PDF_DPI = 72
-MAX_PDF_DPI = 300
-DEFAULT_MAX_PAGES = 3
-
-# Performance modes
-PERFORMANCE_MODES = ["Quality", "Speed"]
-
-# Custom prompt templates
-CUSTOM_PROMPT_TEMPLATES = {
- "Newspaper or Magazine": "This is a newspaper/magazine. Process columns from top to bottom, capture headlines, bylines, article text and captions.",
- "Letter or Correspondence": "This is a letter/correspondence. Capture letterhead, date, greeting, body, closing and signature. Note any handwritten annotations.",
- "Book or Publication": "This is a book/publication. Extract titles, headers, footnotes, page numbers and body text. Preserve paragraph structure and any special formatting.",
- "Form or Legal Document": "This is a form/legal document. Extract all field labels and values, preserving the structure. Pay special attention to signature lines, dates, and any official markings.",
- "Recipe": "This is a recipe. Extract title, ingredients list with measurements, and preparation instructions. Maintain the distinction between ingredients and preparation steps.",
- "Handwritten Document": "This is a handwritten document. Carefully transcribe all handwritten text, preserving line breaks. Note any unclear sections or annotations.",
- "Map or Illustration": "This is a map or illustration. Transcribe all labels, legends, captions, and annotations. Note any scale indicators or directional markings.",
- "Table or Spreadsheet": "This is a table/spreadsheet. Preserve row and column structure, maintaining alignment of data. Extract headers and all cell values.",
- "Other (specify in instructions)": "Please describe the document type and any special processing requirements here."
-}
-
-# Layout prompt additions
-LAYOUT_PROMPT_ADDITIONS = {
- "Multiple columns": "Document has multiple columns. Read each column from top to bottom, then move to the next column.",
- "Table/grid format": "Document contains table data. Preserve row and column structure during extraction.",
- "Mixed layout with images": "Document has mixed text layout with images. Extract text in proper reading order."
-}
-
-# Content themes for subject tag extraction
-CONTENT_THEMES = {
- # Historical Periods
- "Prehistoric": ["paleolithic", "neolithic", "stone age", "bronze age", "iron age", "prehistoric", "ancient", "archaeology", "artifact", "primitive"],
- "Ancient World": ["mesopotamia", "egypt", "greek", "roman", "persia", "babylonian", "assyrian", "pharaoh", "hieroglyphics", "cuneiform", "classical", "antiquity", "hellenistic", "republic", "empire"],
- "Medieval": ["middle ages", "medieval", "feudal", "crusades", "byzantine", "carolingian", "holy roman empire", "dark ages", "castle", "knights", "chivalry", "monastery", "plague", "viking", "norse"],
- "Renaissance": ["renaissance", "humanism", "reformation", "counter-reformation", "medici", "tudor", "elizabethan", "shakespeare", "machiavelli", "gutenberg", "printing press"],
- "Early Modern": ["early modern", "enlightenment", "age of reason", "scientific revolution", "colonial", "colonization", "imperialism", "revolution", "baroque", "bourbon", "habsburg", "stuart"],
- "18th Century": ["18th century", "1700s", "revolution", "american revolution", "french revolution", "enlightenment", "rococo", "neoclassical", "voltaire", "rousseau", "industrial"],
- "19th Century": ["19th century", "1800s", "victorian", "romantic", "napoleonic", "civil war", "industrial revolution", "manifest destiny", "colonial", "imperialism", "belle epoque", "fin de siecle"],
- "20th Century": ["20th century", "1900s", "world war", "great depression", "cold war", "interwar", "postwar", "modernism", "atomic", "post-colonial", "totalitarian", "fascism", "soviet", "civil rights"],
- "Contemporary": ["contemporary", "modern", "postmodern", "digital age", "globalization", "information age", "post-industrial", "post-colonial", "post-soviet", "post-war", "21st century"],
-
- # Geographic Contexts
- "European History": ["europe", "western europe", "eastern europe", "central europe", "mediterranean", "nordic", "iberian", "british", "habsburg", "bourbon", "prussia", "holy roman empire"],
- "Asian History": ["asia", "east asia", "south asia", "central asia", "southeast asia", "china", "japan", "india", "persia", "ottoman", "mongolian", "dynasty", "shogunate", "mughal", "silk road"],
- "African History": ["africa", "north africa", "west africa", "east africa", "sub-saharan", "sahel", "swahili", "maghreb", "nubian", "ethiopian", "zulu", "colonial africa", "apartheid"],
- "American History": ["america", "colonial america", "revolutionary", "antebellum", "civil war", "reconstruction", "frontier", "westward expansion", "manifest destiny", "native american", "indigenous"],
- "Latin American": ["latin america", "mesoamerica", "caribbean", "aztec", "mayan", "inca", "colonial", "viceroyalty", "independence", "revolution", "hispanic", "creole", "mestizo", "indigenous"],
- "Oceanic History": ["oceania", "pacific", "australian", "aboriginal", "indigenous", "polynesian", "melanesian", "micronesian", "maori", "maritime", "exploration", "settlement", "colonial"],
-
- # Historical Methodologies & Approaches
- "Archival Research": ["archive", "manuscript", "primary source", "provenance", "document", "preservation", "cataloging", "repository", "collection", "papers", "fonds", "records", "registry"],
- "Oral History": ["oral history", "testimony", "interview", "narrative", "memory", "ethnography", "storytelling", "tradition", "folklore", "witness", "account", "recording", "indigenous knowledge"],
- "Historical Archaeology": ["archaeology", "excavation", "artifact", "material culture", "stratigraphy", "conservation", "field work", "site", "ruins", "preservation", "heritage", "restoration"],
- "Digital History": ["digital", "database", "digitization", "computational", "network analysis", "gis", "mapping", "visualization", "data mining", "text analysis", "digital humanities", "encoding"],
- "Historiography": ["historiography", "revisionism", "interpretation", "narrative", "discourse", "bias", "perspective", "theory", "methodology", "framework", "historical thinking", "meta-history"],
-
- # Historical Document Types
- "Administrative Records": ["record", "registry", "account", "ledger", "census", "tax roll", "inventory", "charter", "deed", "grant", "patent", "minutes", "docket", "survey", "assessment", "register"],
- "Diplomatic Documents": ["treaty", "agreement", "proclamation", "declaration", "diplomatic", "embassy", "consul", "dispatch", "communique", "protocol", "convention", "alliance", "international"],
- "Personal Papers": ["diary", "journal", "memoir", "autobiography", "correspondence", "letter", "personal", "private", "papers", "notes", "scrapbook", "commonplace book", "sketchbook"],
- "Media History": ["newspaper", "gazette", "periodical", "pamphlet", "broadside", "print culture", "press", "editorial", "journalism", "reporter", "editor", "circulation", "readership", "subscriber"],
- "Visual Materials": ["photograph", "illustration", "print", "map", "atlas", "cartography", "engraving", "woodcut", "lithograph", "panorama", "portrait", "landscape", "sketch", "drawing", "plate"],
- "Legal Documents": ["legal", "law", "statute", "code", "constitution", "legislation", "decree", "ordinance", "bylaw", "regulation", "case", "trial", "testimony", "deposition", "verdict", "judgment"],
-
- # Historical Themes & Movements
- "Economic History": ["economic", "commerce", "trade", "market", "merchant", "finance", "banking", "currency", "coin", "inflation", "recession", "depression", "exchange", "capital", "labor", "guild"],
- "Social History": ["social", "society", "class", "status", "hierarchy", "everyday life", "community", "neighborhood", "urban", "rural", "poverty", "wealth", "leisure", "entertainment", "customs"],
- "Political History": ["political", "politics", "government", "state", "monarchy", "republic", "democracy", "aristocracy", "parliament", "congress", "election", "regime", "policy", "reform", "revolution"],
- "Intellectual History": ["intellectual", "idea", "philosophy", "theory", "concept", "movement", "thought", "discourse", "debate", "enlightenment", "rationalism", "empiricism", "ideology"],
- "Cultural History": ["cultural", "culture", "custom", "tradition", "ritual", "ceremony", "festival", "celebration", "holiday", "folklore", "music", "art", "literature", "fashion", "consumption"],
- "Religious History": ["religious", "religion", "church", "theology", "belief", "faith", "worship", "ritual", "sacred", "clergy", "monastery", "temple", "mosque", "synagogue", "pilgrimage", "sect"],
- "Military History": ["military", "war", "conflict", "battle", "campaign", "siege", "army", "navy", "soldier", "officer", "regiment", "battalion", "artillery", "cavalry", "infantry", "strategy", "tactics"],
- "Science History": ["scientific", "science", "experiment", "discovery", "theory", "hypothesis", "observation", "laboratory", "academy", "research", "natural philosophy", "medicine", "technology"],
- "Environmental History": ["environmental", "ecology", "climate", "weather", "landscape", "agriculture", "farming", "forestry", "conservation", "pollution", "resource", "sustainability", "natural"],
-
- # Specialized Historical Topics
- "Migration History": ["migration", "immigration", "emigration", "diaspora", "exile", "refugee", "settlement", "colonization", "population movement", "forced migration", "displacement", "resettlement"],
- "Maritime History": ["maritime", "naval", "shipping", "navigation", "sailor", "piracy", "privateering", "admiralty", "port", "harbor", "shipyard", "vessel", "sail", "trade route", "exploration"],
- "Gender History": ["gender", "women", "feminist", "sexuality", "masculinity", "femininity", "patriarchy", "suffrage", "domestic", "family", "marriage", "emancipation", "rights", "equality"],
- "Labor History": ["labor", "worker", "union", "strike", "apprentice", "guild", "factory", "workshop", "wage", "hours", "working conditions", "industrialization", "mechanization", "automation"],
- "Urban History": ["urban", "city", "town", "metropolitan", "municipal", "civic", "suburb", "neighborhood", "planning", "infrastructure", "utilities", "housing", "development", "gentrification"],
- "Rural History": ["rural", "countryside", "village", "agricultural", "farming", "peasant", "yeoman", "tenant", "sharecropper", "enclosure", "common land", "manor", "estate", "plantation"],
- "Colonial History": ["colonial", "colony", "settlement", "frontier", "borderland", "territory", "dominion", "province", "governance", "administration", "native", "indigenous", "contact zone"],
- "Indigenous History": ["indigenous", "native", "aboriginal", "first nations", "tribal", "reservation", "sovereignty", "land rights", "treaty rights", "cultural preservation", "oral tradition"],
-
- # General Historical Terms
- "Historical": ["history", "historical", "historiography", "heritage", "legacy", "tradition", "memory", "commemoration", "preservation", "conservation", "restoration", "interpretation", "significance"],
- "Chronology": ["chronology", "timeline", "periodization", "era", "epoch", "age", "century", "decade", "millennium", "year", "date", "dating", "chronological", "contemporary", "synchronic", "diachronic"],
- "Heritage": ["heritage", "preservation", "conservation", "landmark", "monument", "historic site", "museum", "archive", "collection", "artifact", "relic", "antiquity", "cultural heritage", "patrimony"]
-}
-
-# Period tags based on year ranges
-# These ranges are used to assign historical period tags to documents based on their year.
-PERIOD_TAGS = {
- (0, 499): "Ancient Era (to 500 CE)",
- (500, 999): "Early Medieval (500–1000)",
- (1000, 1299): "High Medieval (1000–1300)",
- (1300, 1499): "Late Medieval (1300–1500)",
- (1500, 1599): "Renaissance (1500–1600)",
- (1600, 1699): "Early Modern (1600–1700)",
- (1700, 1775): "Enlightenment (1700–1775)",
- (1776, 1799): "Age of Revolutions (1776–1800)",
- (1800, 1849): "Early 19th Century (1800–1850)",
- (1850, 1899): "Late 19th Century (1850–1900)",
- (1900, 1918): "Early 20th Century & WWI (1900–1918)",
- (1919, 1938): "Interwar Period (1919–1938)",
- (1939, 1945): "World War II (1939–1945)",
- (1946, 1968): "Postwar & Mid-20th Century (1946–1968)",
- (1969, 1989): "Late 20th Century (1969–1989)",
- (1990, 2000): "Turn of the 21st Century (1990–2000)",
- (2001, 2099): "Contemporary (21st Century)"
-}
-
-# Default fallback tags for documents when no specific tags are detected.
-DEFAULT_TAGS = [
- "Document",
- "Historical",
- "Text",
- "Primary Source",
- "Archival Material",
- "Record",
- "Manuscript",
- "Printed Material",
- "Correspondence",
- "Publication"
-]
-
-# Generic tags that can be used for broad categorization or as supplemental tags.
-GENERIC_TAGS = [
- "Archive",
- "Content",
- "Record",
- "Source",
- "Material",
- "Page",
- "Scan",
- "Image",
- "Transcription",
- "Uncategorized",
- "General",
- "Miscellaneous"
-]
-
-# UI constants
-PROGRESS_DELAY = 0.8 # Seconds to show completion message
diff --git a/error_handler.py b/error_handler.py
deleted file mode 100644
index 18750cdf24b9aa67e24b1e25b465bf5a11ff538e..0000000000000000000000000000000000000000
--- a/error_handler.py
+++ /dev/null
@@ -1,65 +0,0 @@
-import logging
-import streamlit as st
-import time
-from constants import MAX_FILE_SIZE_MB
-
-# Configure logging
-logger = logging.getLogger("error_handler")
-logger.setLevel(logging.INFO)
-
-def handle_ocr_error(exception, progress_reporter=None):
- """
- Handle OCR processing errors and provide user-friendly messages
-
- Args:
- exception: The exception that occurred
- progress_reporter: ProgressReporter instance for UI updates
-
- Returns:
- str: User-friendly error message
- """
- error_message = str(exception)
-
- # Complete progress reporting if provided
- if progress_reporter:
- progress_reporter.complete(success=False)
-
- # Check for specific error types and provide helpful user-facing messages
- if "rate limit" in error_message.lower() or "429" in error_message or "requests rate limit exceeded" in error_message.lower():
- friendly_message = "The AI service is currently experiencing high demand. Please try again in a few minutes."
- logger.error(f"Rate limit error: {error_message}")
- return friendly_message
- elif "quota" in error_message.lower() or "credit" in error_message.lower() or "subscription" in error_message.lower():
- friendly_message = "The API usage quota has been reached. Please check your API key and subscription limits."
- logger.error(f"API quota error: {error_message}")
- return friendly_message
- elif "timeout" in error_message.lower() or "timed out" in error_message.lower():
- friendly_message = "The request timed out. This may be due to a large document or high server load. Please try again or use a smaller document."
- logger.error(f"Timeout error: {error_message}")
- return friendly_message
- elif "file size" in error_message.lower() or "too large" in error_message.lower():
- friendly_message = f"The file is too large. Maximum file size is {MAX_FILE_SIZE_MB}MB."
- logger.error(f"File size error: {error_message}")
- return friendly_message
- else:
- # Generic error message for other errors
- logger.error(f"OCR processing error: {error_message}", exc_info=True)
- return f"An error occurred during processing: {error_message}"
-
-def check_file_size(file_bytes):
- """
- Check if file size is within limits
-
- Args:
- file_bytes: File content as bytes
-
- Returns:
- tuple: (is_valid, file_size_mb, error_message)
- """
- file_size_mb = len(file_bytes) / (1024 * 1024)
-
- if file_size_mb > MAX_FILE_SIZE_MB:
- error_message = f"File size {file_size_mb:.2f} MB exceeds limit of {MAX_FILE_SIZE_MB} MB"
- return False, file_size_mb, error_message
-
- return True, file_size_mb, None
diff --git a/image_segmentation.py b/image_segmentation.py
deleted file mode 100644
index b0137dcedb1947f229aa0cc238fb43cbdf309325..0000000000000000000000000000000000000000
--- a/image_segmentation.py
+++ /dev/null
@@ -1,253 +0,0 @@
-"""
-Image segmentation utility for OCR preprocessing.
-Separates text regions from image regions to improve OCR accuracy on mixed-content documents.
-Uses content-aware adaptive segmentation for improved results across document types.
-"""
-
-import cv2
-import numpy as np
-from PIL import Image
-import io
-import base64
-import logging
-from pathlib import Path
-from typing import Tuple, List, Dict, Union, Optional
-
-# Configure logging
-logging.basicConfig(level=logging.INFO,
- format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
-logger = logging.getLogger(__name__)
-
-def segment_image_for_ocr(image_path: Union[str, Path], vision_enabled: bool = True, preserve_content: bool = True) -> Dict[str, Union[Image.Image, str]]:
- """
- Prepare image for OCR processing using content-aware segmentation.
- Uses adaptive region detection based on text density analysis.
-
- Args:
- image_path: Path to the image file
- vision_enabled: Whether the vision model is enabled
- preserve_content: Whether to preserve original content without enhancement
-
- Returns:
- Dict containing segmentation results
- """
- # Convert to Path object if string
- image_file = Path(image_path) if isinstance(image_path, str) else image_path
-
- # Log start of processing
- logger.info(f"Preparing image for Mistral OCR: {image_file.name}")
-
- try:
- # Open original image with PIL
- with Image.open(image_file) as pil_img:
- # Check for low entropy images when vision is disabled
- if not vision_enabled:
- from utils.image_utils import calculate_image_entropy
- ent = calculate_image_entropy(pil_img)
- if ent < 3.5: # Likely line-art or blank page
- logger.info(f"Low entropy image detected ({ent:.2f}), classifying as illustration")
- return {
- 'text_regions': None,
- 'image_regions': pil_img,
- 'text_mask_base64': None,
- 'combined_result': None,
- 'text_regions_coordinates': []
- }
-
- # Convert to RGB if needed
- if pil_img.mode != 'RGB':
- pil_img = pil_img.convert('RGB')
-
- # Get image dimensions
- img_np = np.array(pil_img)
- img_width, img_height = pil_img.size
-
- # Analyze text density to determine if advanced segmentation is needed
- # This replaces document-specific logic with content-aware analysis
- from utils.image_utils import estimate_text_density
- text_density = estimate_text_density(img_np)
-
- # Use adaptive approach for documents with unusual text distribution
- if text_density['pattern'] == 'varied' or text_density['uppercase_sections'] > 0:
- logger.info(f"Using adaptive segmentation for document with varied text density pattern={text_density['pattern']}, uppercase_sections={text_density['uppercase_sections']}")
-
- # Detect content regions based on text density
- from utils.text_utils import detect_content_regions
- regions = detect_content_regions(img_np)
-
- # Create visualization with green borders around the text regions
- vis_img = img_np.copy()
-
- # Draw regions on visualization
- for x, y, w, h in regions:
- cv2.rectangle(vis_img, (x, y), (x+w, y+h), (0, 255, 0), 3)
-
- # Add text to indicate we're using adaptive processing
- font = cv2.FONT_HERSHEY_SIMPLEX
- cv2.putText(vis_img, "Adaptive region processing", (30, 60), font, 1, (0, 255, 0), 2)
-
- # Create visualization images
- text_regions_vis = Image.fromarray(vis_img)
- image_regions_vis = text_regions_vis.copy()
-
- # Create a mask highlighting the text regions
- text_mask = np.zeros((img_height, img_width), dtype=np.uint8)
- for x, y, w, h in regions:
- text_mask[y:y+h, x:x+w] = 255
-
- _, buffer = cv2.imencode('.png', text_mask)
- text_mask_base64 = base64.b64encode(buffer).decode('utf-8')
-
- # Extract region images
- region_images = []
- for i, (x, y, w, h) in enumerate(regions):
- region = img_np[y:y+h, x:x+w].copy()
- region_pil = Image.fromarray(region)
-
- region_info = {
- 'image': region,
- 'pil_image': region_pil,
- 'coordinates': (x, y, w, h),
- 'padded_coordinates': (x, y, w, h),
- 'order': i
- }
- region_images.append(region_info)
-
- # Return the adaptive segmentation results
- return {
- 'text_regions': text_regions_vis,
- 'image_regions': image_regions_vis,
- 'text_mask_base64': f"data:image/png;base64,{text_mask_base64}",
- 'combined_result': pil_img,
- 'text_regions_coordinates': regions,
- 'region_images': region_images,
- 'segmentation_type': 'adaptive'
- }
- else:
- # SIMPLIFIED APPROACH for most documents
- # Let Mistral OCR handle the entire document understanding process
- logger.info(f"Using standard approach for document with uniform text density")
-
- # For visualization, mark the entire image as a text region
- full_image_region = [(0, 0, img_width, img_height)]
-
- # Create visualization with a simple border
- vis_img = img_np.copy()
- cv2.rectangle(vis_img, (5, 5), (img_width-5, img_height-5), (0, 255, 0), 5)
-
- # Add text to indicate this is using Mistral's native processing
- font = cv2.FONT_HERSHEY_SIMPLEX
- cv2.putText(vis_img, "Processed by Mistral OCR", (30, 60), font, 1, (0, 255, 0), 2)
-
- # Create visualizations and masks
- text_regions_vis = Image.fromarray(vis_img)
- image_regions_vis = text_regions_vis.copy()
-
- # Create a mask of the entire image (just for visualization)
- text_mask = np.ones((img_height, img_width), dtype=np.uint8) * 255
- _, buffer = cv2.imencode('.png', text_mask)
- text_mask_base64 = base64.b64encode(buffer).decode('utf-8')
-
- # Return the original image as the combined result
- return {
- 'text_regions': text_regions_vis,
- 'image_regions': image_regions_vis,
- 'text_mask_base64': f"data:image/png;base64,{text_mask_base64}",
- 'combined_result': pil_img,
- 'text_regions_coordinates': full_image_region,
- 'region_images': [{
- 'image': img_np,
- 'pil_image': pil_img,
- 'coordinates': (0, 0, img_width, img_height),
- 'padded_coordinates': (0, 0, img_width, img_height),
- 'order': 0
- }],
- 'segmentation_type': 'simplified'
- }
-
- except Exception as e:
- logger.error(f"Error segmenting image {image_file.name}: {str(e)}")
- # Return None values if processing fails
- return {
- 'text_regions': None,
- 'image_regions': None,
- 'text_mask_base64': None,
- 'combined_result': None,
- 'text_regions_coordinates': []
- }
-
-def process_segmented_image(image_path: Union[str, Path], output_dir: Optional[Path] = None, preserve_content: bool = True) -> Dict:
- """
- Process an image using segmentation for improved OCR, saving visualization outputs.
-
- Args:
- image_path: Path to the image file
- output_dir: Optional directory to save visualization outputs
-
- Returns:
- Dictionary with processing results and paths to output files
- """
- # Convert to Path object if string
- image_file = Path(image_path) if isinstance(image_path, str) else image_path
-
- # Create output directory if not provided
- if output_dir is None:
- output_dir = Path("output") / "segmentation"
- output_dir.mkdir(parents=True, exist_ok=True)
-
- # Process the image with segmentation
- segmentation_results = segment_image_for_ocr(image_file)
-
- # Prepare results dictionary
- results = {
- 'original_image': str(image_file),
- 'output_files': {}
- }
-
- # Save visualization outputs if segmentation was successful
- if segmentation_results['text_regions'] is not None:
- # Save text regions visualization
- text_regions_path = output_dir / f"{image_file.stem}_text_regions.jpg"
- segmentation_results['text_regions'].save(text_regions_path)
- results['output_files']['text_regions'] = str(text_regions_path)
-
- # Save image regions visualization
- image_regions_path = output_dir / f"{image_file.stem}_image_regions.jpg"
- segmentation_results['image_regions'].save(image_regions_path)
- results['output_files']['image_regions'] = str(image_regions_path)
-
- # Save combined result
- combined_path = output_dir / f"{image_file.stem}_combined.jpg"
- segmentation_results['combined_result'].save(combined_path)
- results['output_files']['combined_result'] = str(combined_path)
-
- # Save text mask visualization
- text_mask_path = output_dir / f"{image_file.stem}_text_mask.png"
- # Save text mask from base64
- if segmentation_results['text_mask_base64']:
- base64_data = segmentation_results['text_mask_base64'].split(',')[1]
- with open(text_mask_path, 'wb') as f:
- f.write(base64.b64decode(base64_data))
- results['output_files']['text_mask'] = str(text_mask_path)
-
- # Add detected text regions count
- results['text_regions_count'] = len(segmentation_results['text_regions_coordinates'])
- results['text_regions_coordinates'] = segmentation_results['text_regions_coordinates']
-
- return results
-
-if __name__ == "__main__":
- # Simple test - process a sample image if run directly
- import sys
-
- if len(sys.argv) > 1:
- image_path = sys.argv[1]
- else:
- image_path = "input/handwritten-journal.jpg" # Example image path"
-
- logger.info(f"Testing image segmentation on {image_path}")
- results = process_segmented_image(image_path)
-
- # Print results summary
- logger.info(f"Segmentation complete. Found {results.get('text_regions_count', 0)} text regions.")
- logger.info(f"Output files saved to: {[path for path in results.get('output_files', {}).values()]}")
diff --git a/input/The Magician, or Bottle Cungerer.jpeg b/input/The Magician, or Bottle Cungerer.jpeg
new file mode 100644
index 0000000000000000000000000000000000000000..8e2045676510f0e481670d6763f3930ae6b203da
--- /dev/null
+++ b/input/The Magician, or Bottle Cungerer.jpeg
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3becaf6f5548a794436864885bb125f3fa09f1e6f7bdd76e8878f2d36ff26232
+size 2960280
diff --git a/input/americae-retectio.jpg b/input/americae-retectio.jpg
deleted file mode 100644
index e23be5e413f0b2c5a8421cb7b4e93a64819bd439..0000000000000000000000000000000000000000
--- a/input/americae-retectio.jpg
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:3ea42f6d3f7c0331a08321c26978c9011843965de99735a178de8167fdede544
-size 451559
diff --git a/input/baldwin-letter-1.jpg b/input/baldwin-letter-1.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..4e6b833fc8ab797f608dfe7c4e92642ca8b773d3
--- /dev/null
+++ b/input/baldwin-letter-1.jpg
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a30d7d9f224c777a1697507200a87e41be5fd590efbe8271fa41dbd8bd8a158d
+size 135244
diff --git a/input/baldwin-letter-2.jpg b/input/baldwin-letter-2.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..b19ec5985abd7d36aef556b7427d2524c54d5d13
--- /dev/null
+++ b/input/baldwin-letter-2.jpg
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8b605a6eabd466da265e9e1aa3576160c4dbee06643ece5a18cdb1e45f3f683a
+size 114136
diff --git a/input/baldwin-letter.jpg b/input/baldwin-letter.jpg
deleted file mode 100644
index 3107ff733b3bd2d0bb820dda6dd17dc65b7766fe..0000000000000000000000000000000000000000
--- a/input/baldwin-letter.jpg
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:bf2feb9fdf509fdb58b5c8f3b9a4c2fbc6e21d3e55c19b8f3ade8e37cccfe030
-size 195092
diff --git a/input/flier.png b/input/flier.png
new file mode 100644
index 0000000000000000000000000000000000000000..a02e7743490614e3f9884a88aa9ad15214609a34
Binary files /dev/null and b/input/flier.png differ
diff --git a/input/gender.jpg b/input/gender.jpg
deleted file mode 100644
index 5c9db9cffd3f12d592fe733fee7e347616a814b4..0000000000000000000000000000000000000000
--- a/input/gender.jpg
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:af612b5c7ef59bc6c9bf7cbcedb88c69628ff8c46c66f1cb8df83729f5df59f8
-size 129906
diff --git a/input/handwritten-journal.jpg b/input/handwritten-journal.jpg
deleted file mode 100644
index eebd5f6fea26e404b20d427b2d1379d0515ec226..0000000000000000000000000000000000000000
--- a/input/handwritten-journal.jpg
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:279f7c915ae54aafb30e6d70e480eb74e73b6aa92de20f60cd13019e9debbb62
-size 1459485
diff --git a/input/harpers.pdf b/input/harpers.pdf
deleted file mode 100644
index 8a1176f2f7b8a2d74f042a5415d6946d8e4a821b..0000000000000000000000000000000000000000
--- a/input/harpers.pdf
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:3c9030714b07bb5f7c9adf8b175975baa9b4f40402da62d69cad9b0d4ba61b94
-size 14931299
diff --git a/input/letter-1.jpg b/input/letter-1.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..4e6b833fc8ab797f608dfe7c4e92642ca8b773d3
--- /dev/null
+++ b/input/letter-1.jpg
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a30d7d9f224c777a1697507200a87e41be5fd590efbe8271fa41dbd8bd8a158d
+size 135244
diff --git a/input/letter-2.jpg b/input/letter-2.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..b19ec5985abd7d36aef556b7427d2524c54d5d13
--- /dev/null
+++ b/input/letter-2.jpg
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8b605a6eabd466da265e9e1aa3576160c4dbee06643ece5a18cdb1e45f3f683a
+size 114136
diff --git a/input/letter-3.jpg b/input/letter-3.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..fa0cc25dd4affd353900e34e9e986d6fa435ee8e
--- /dev/null
+++ b/input/letter-3.jpg
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7fe2d81bb4e8bef7cdbf87c58a8cc180c49c313e5099de167ae37bbbfb895e88
+size 230837
diff --git a/input/magician-satire.jpg b/input/magician-satire.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..8e2045676510f0e481670d6763f3930ae6b203da
--- /dev/null
+++ b/input/magician-satire.jpg
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3becaf6f5548a794436864885bb125f3fa09f1e6f7bdd76e8878f2d36ff26232
+size 2960280
diff --git a/input/menu.pdf b/input/menu.pdf
new file mode 100644
index 0000000000000000000000000000000000000000..276af61cf2f4ac361eaa86b3c430c527b1e59230
--- /dev/null
+++ b/input/menu.pdf
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:42d96008f374f5be8046b569c868e33f4e5a0e5e166c245d324b44140c7e6c2e
+size 2554815
diff --git a/input/milgram-flier.png b/input/milgram-flier.png
index aaa701cfd54519b72b05ee861537c7f537c266af..a02e7743490614e3f9884a88aa9ad15214609a34 100644
Binary files a/input/milgram-flier.png and b/input/milgram-flier.png differ
diff --git a/input/okeefe-menu.pdf b/input/okeefe-menu.pdf
new file mode 100644
index 0000000000000000000000000000000000000000..276af61cf2f4ac361eaa86b3c430c527b1e59230
--- /dev/null
+++ b/input/okeefe-menu.pdf
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:42d96008f374f5be8046b569c868e33f4e5a0e5e166c245d324b44140c7e6c2e
+size 2554815
diff --git a/input/okeefe-recipe.jpg b/input/okeefe-recipe.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..1701f0ba9840b4b978a4bae4d14c2780516c5f26
Binary files /dev/null and b/input/okeefe-recipe.jpg differ
diff --git a/input/photo-baldwin-letter.jpg b/input/photo-baldwin-letter.jpg
deleted file mode 100644
index 3b8b36c3230dc68f3a3fd6fe5bc5a629cd25f859..0000000000000000000000000000000000000000
--- a/input/photo-baldwin-letter.jpg
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:c34d82f1e463aa471c326488d02b6ec1e7083842bfd9456292c94b2d54c71d96
-size 245212
diff --git a/input/recipe.jpg b/input/recipe.jpg
index 880bf752d43db6863a80c7af9c235d49a5f6954c..1701f0ba9840b4b978a4bae4d14c2780516c5f26 100644
Binary files a/input/recipe.jpg and b/input/recipe.jpg differ
diff --git a/letterhead_handler.py b/letterhead_handler.py
deleted file mode 100644
index 19e0c78799f2a3a7d497d144d23ff86d407c3609..0000000000000000000000000000000000000000
--- a/letterhead_handler.py
+++ /dev/null
@@ -1,197 +0,0 @@
-"""
-Specialized handler for letterhead and marginalia documents.
-Enhances OCR quality by providing document-specific prompts for common layouts.
-"""
-
-import re
-import logging
-from pathlib import Path
-from typing import Union, Dict, Any, Optional, List
-
-# Configure logging
-logging.basicConfig(level=logging.INFO,
- format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
-logger = logging.getLogger(__name__)
-
-def is_likely_letterhead(image_path: Union[str, Path], features: Optional[Dict[str, Any]] = None) -> bool:
- """
- Detect if an image is likely a letterhead document with marginalia.
- Uses path/filename patterns and optional image features (if provided).
-
- Args:
- image_path: Path to the image file
- features: Optional dict of image features from preprocessing
-
- Returns:
- bool: True if likely a letterhead document
- """
- # Convert to string path for pattern matching
- path_str = str(image_path).lower()
-
- # Check for common letterhead filename patterns
- letterhead_patterns = [
- r'letter(head)?[^/]*\.jpg',
- r'hotel[^/]*\.jpg',
- r'baldwin.*\.jpg',
- r'business.*letter.*\.jpg',
- r'correspondence.*\.jpg'
- ]
-
- for pattern in letterhead_patterns:
- if re.search(pattern, path_str):
- logger.info(f"Detected likely letterhead document: {Path(image_path).name}")
- return True
-
- # If features are provided, use them for additional detection
- if features:
- # Check for ALL CAPS sections that might be marginalia
- if features.get('uppercase_sections', 0) > 1:
- logger.info(f"Detected likely letterhead document with marginalia by features: {Path(image_path).name}")
- return True
-
- return False
-
-def get_letterhead_prompt(image_path: Union[str, Path], features: Optional[Dict[str, Any]] = None) -> Optional[str]:
- """
- Generate a specialized prompt for letterhead documents to improve OCR quality.
-
- Args:
- image_path: Path to the image file
- features: Optional dict of image features from preprocessing
-
- Returns:
- str: Custom prompt for letterhead OCR or None if not applicable
- """
- if not is_likely_letterhead(image_path, features):
- return None
-
- # Path-specific customizations for known problematic documents
- path_str = str(image_path).lower()
-
- # Most specialized prompt for baldwin documents
- if "baldwin" in path_str:
- return """
- This image shows a hotel letterhead with a handwritten letter. Please extract the text with the following guidelines:
-
- 1. Identify and separate the letterhead elements:
- - Header: The hotel name, address, and contact information at the top
- - Marginalia: The amenities description in ALL CAPS along the margins
-
- 2. Extract the main handwritten letter content separately
-
- 3. Note any image captions separately
-
- 4. Format the output as follows:
- - HEADER: [header text]
- - MARGINS: [marginalia text]
- - LETTER: [handwritten letter text]
- - CAPTIONS: [any image captions]
-
- Be careful not to duplicate content between sections, especially with margin text.
- """
-
- # General letterhead prompt
- return """
- This appears to be a letterhead document. Please extract the text with the following guidelines:
-
- 1. Identify the header/letterhead section with company name, logo, address, etc.
- 2. Identify any margin text or notes that appear separate from the main content
- 3. Extract the main letter/document body separately
- 4. Format the output as follows:
- - LETTERHEAD: [letterhead text]
- - MARGIN_NOTES: [any text in margins]
- - BODY: [main document body]
-
- Be careful not to duplicate content between sections.
- """
-
-def clean_letterhead_ocr_output(text: str) -> str:
- """
- Clean OCR output from letterhead documents by handling section markers
- and reducing duplication.
-
- Args:
- text: OCR text from letterhead document
-
- Returns:
- str: Cleaned text with proper section formatting
- """
- if not text:
- return ""
-
- # Find any section markers added by the specialized prompt
- section_markers = [
- "HEADER:", "LETTERHEAD:", "MARGINS:", "MARGIN_NOTES:",
- "LETTER:", "BODY:", "CAPTIONS:"
- ]
-
- # Check if the text has any section markers
- has_sections = any(marker in text for marker in section_markers)
-
- if has_sections:
- # Split text into sections while preserving section headers
- sections = {}
- current_section = "UNKNOWN"
- current_text = []
-
- for line in text.split('\n'):
- # Check if this line is a section marker
- is_marker = False
- for marker in section_markers:
- if marker in line:
- # Save previous section
- if current_text:
- sections[current_section] = '\n'.join(current_text).strip()
- current_text = []
-
- # Start new section
- current_section = marker.replace(':', '')
- # Keep any text after the marker on this line
- remainder = line.split(marker, 1)[1].strip()
- if remainder:
- current_text.append(remainder)
- is_marker = True
- break
-
- # If not a marker, add to current section
- if not is_marker:
- current_text.append(line)
-
- # Save the last section
- if current_text:
- sections[current_section] = '\n'.join(current_text).strip()
-
- # Format with standard order and clear section headers
- formatted_sections = []
-
- # First add letterhead/header info
- if "LETTERHEAD" in sections:
- formatted_sections.append(f"--- LETTERHEAD ---\n{sections['LETTERHEAD']}")
- elif "HEADER" in sections:
- formatted_sections.append(f"--- LETTERHEAD ---\n{sections['HEADER']}")
-
- # Add margins/notes
- if "MARGIN_NOTES" in sections:
- formatted_sections.append(f"--- MARGIN NOTES ---\n{sections['MARGIN_NOTES']}")
- elif "MARGINS" in sections:
- formatted_sections.append(f"--- MARGIN NOTES ---\n{sections['MARGINS']}")
-
- # Add main content
- if "BODY" in sections:
- formatted_sections.append(f"--- DOCUMENT BODY ---\n{sections['BODY']}")
- elif "LETTER" in sections:
- formatted_sections.append(f"--- DOCUMENT BODY ---\n{sections['LETTER']}")
-
- # Add captions if present
- if "CAPTIONS" in sections:
- formatted_sections.append(f"--- IMAGE CAPTIONS ---\n{sections['CAPTIONS']}")
-
- # Add unknown sections
- if "UNKNOWN" in sections and sections["UNKNOWN"]:
- formatted_sections.append(f"--- ADDITIONAL CONTENT ---\n{sections['UNKNOWN']}")
-
- # Join everything with clear separation
- return "\n\n".join(formatted_sections)
- else:
- # If no section markers were found, return the original text
- return text
\ No newline at end of file
diff --git a/modules/content/__init__.py b/modules/content/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..9b5767eca0096d8d3be1514fcb79c97e0f4caf7b
--- /dev/null
+++ b/modules/content/__init__.py
@@ -0,0 +1,36 @@
+"""
+Module initialization file for the workshop modules.
+"""
+from . import module1, module2, module3, module4, module5, module6
+
+# Module registry for easy access by module number
+modules = {
+ 1: module1,
+ 2: module2,
+ 3: module3,
+ 4: module4,
+ 5: module5,
+ 6: module6
+}
+
+# Module names for navigation and display
+module_names = [
+ "Introduction",
+ "Text-Image Relations",
+ "OCR Technology",
+ "Methodological Approaches",
+ "Interactive OCR",
+ "Conclusion"
+]
+
+def get_module(module_number):
+ """Get a module by its number (1-6)"""
+ if module_number in modules:
+ return modules[module_number]
+ raise ValueError(f"Unknown module number: {module_number}")
+
+def get_module_name(module_number):
+ """Get a module name by its number (1-6)"""
+ if 1 <= module_number <= len(module_names):
+ return module_names[module_number - 1]
+ return f"Module {module_number}"
\ No newline at end of file
diff --git a/modules/content/module1.py b/modules/content/module1.py
new file mode 100644
index 0000000000000000000000000000000000000000..822f36dd501269596ad5f37b510027a362eb375a
--- /dev/null
+++ b/modules/content/module1.py
@@ -0,0 +1,85 @@
+import streamlit as st
+from layout import gray_container, blue_container, yellow_container, card_grid, key_concept
+
+def render():
+ """Module 1: Introduction and Problematization"""
+
+ st.title("Module 1: Introduction and Problematization")
+
+ # Workshop overview in gray container
+ overview_content = """
+
Workshop Overview
+
+ This interactive workshop explores the application of OCR technology to historical documents,
+ combining theoretical understanding with practical experiences. Designed for historians,
+ archivists, and digital humanities scholars, it offers both conceptual frameworks and hands-on skills.
+
+ """
+ gray_container(overview_content)
+
+ # For historians section with blue background
+ historians_content = """
+
For Historians:
+
+ How might OCR technology transform our access to and interpretation of historical
+ documents? What new research questions become possible when large archives
+ become machine-readable?
+
+ """
+ blue_container(historians_content)
+
+ # What is OCR section with yellow background
+ ocr_content = """
+
What is OCR?
+
+ Optical Character Recognition (OCR) technology enables computers to extract text from images and documents.
+ Modern OCR uses AI vision models to understand both the text and its visual context.
+
How might the capabilities of vision-language models change our approach to digitizing historical archives?
+ """
+ research_question(research_content)
+
+ # Display history if available
+ if 'processing_history' in st.session_state and st.session_state.processing_history:
+ with st.expander("Your OCR Processing History"):
+ st.markdown("You've already processed the following documents:")
+
+ for item in st.session_state.processing_history:
+ st.markdown(f"**{item['fileName']}**")
+ col1, col2 = st.columns(2)
+ with col1:
+ st.write(f"**Topics:** {', '.join(item['result'].get('topics', ['Unknown']))}")
+ with col2:
+ st.write(f"**Vision model used:** {'Yes' if item['useVision'] else 'No'}")
\ No newline at end of file
diff --git a/modules/content/module4.py b/modules/content/module4.py
new file mode 100644
index 0000000000000000000000000000000000000000..81dac95f5f7b62960b00dbe68f0ed1b7be357812
--- /dev/null
+++ b/modules/content/module4.py
@@ -0,0 +1,124 @@
+import streamlit as st
+from pathlib import Path
+from layout import gray_container, tool_container, key_concept, quote
+
+def render():
+ """Module 4: Methodological Approaches"""
+
+ st.title("Module 4: Methodological Approaches")
+
+ col1, col2 = st.columns([1, 1])
+
+ with col1:
+ hybrid_content = """
+
Hybrid Methodologies
+
+
1. Computational + Human Reading
+
+
OCR for initial processing and discovery
+
Human review for context and interpretation
+
Iterative refinement of computational outputs
+
+
+
2. Close + Distant Reading
+
+
Distant reading through large-scale OCR processing
+
Close reading of selected passages
+
Zooming between scales of analysis
+
+ """
+ gray_container(hybrid_content)
+
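+ # Show the diagram if present. NOTE(review): from modules/content/, parent.parent is modules/, not the repo-root input/ added by this patch - confirm the path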
+ input_dir = Path(__file__).parent.parent / "input"
+ diagram_path = input_dir / "diagram.jpg"
+
+ if diagram_path.exists():
+ try:
+ from PIL import Image
+ with Image.open(diagram_path) as img:
+ st.image(img, caption="Historical VLM architecture", use_column_width=True)
+ except Exception:
+ # If there's an error, just show a placeholder
+ st.image("https://placekitten.com/800/400", caption="Historical VLM architecture placeholder")
+ else:
+ # If the file doesn't exist, show a placeholder
+ st.image("https://placekitten.com/800/400", caption="Historical VLM architecture placeholder")
+
+ with col2:
+ mistral_content = """
+
Mistral-OCR-Latest: State-of-the-Art
+
+
The Mistral-OCR model represents a significant advancement:
+
+
Multimodal Understanding: Processes both visual and textual information
Historical Font Adaptation: Trained on diverse historical typography
+
+ """
+ gray_container(mistral_content)
+
+ # Check if the workflow image is available and display it
+ workflow_path = input_dir / "workflow.jpg"
+
+ if workflow_path.exists():
+ try:
+ from PIL import Image
+ with Image.open(workflow_path) as img:
+ st.image(img, caption="Mistral OCR workflow", use_column_width=True)
+ except Exception:
+ # If there's an error, just show a placeholder
+ st.image("https://placekitten.com/800/400", caption="Mistral OCR workflow placeholder")
+ else:
+ # If the file doesn't exist, show a placeholder
+ st.image("https://placekitten.com/800/400", caption="Mistral OCR workflow placeholder")
+
+ # Practical workflow section
+ workflow_content = """
+
Practical Workflow
+
+
A typical historical OCR workflow with Mistral-OCR includes:
+
+
Selection: Choosing appropriate documents
+
Preprocessing: Enhancing images before OCR
+
OCR Processing: Running documents through vision-enhanced OCR
+
Post-processing: Cleaning up outputs and structured extraction
+
Verification: Cross-checking results against originals
+
Integration: Incorporating OCR outputs into research materials
+ """
+ gray_container(limitations_content)
+
+ # Quote
+ quote_content = "The most powerful digital humanities work occurs at the intersection of computational methods and traditional humanistic inquiry."
+ quote(quote_content, "Dr. Sarah E. Bond, Digital Humanities Scholar")
\ No newline at end of file
diff --git a/modules/content/module5.py b/modules/content/module5.py
new file mode 100644
index 0000000000000000000000000000000000000000..6bc50f5e03de8e234a6c3369da6b68ce4a0faee0
--- /dev/null
+++ b/modules/content/module5.py
@@ -0,0 +1,547 @@
+import streamlit as st
+import io
+import tempfile
+from pathlib import Path
+from datetime import datetime
+from layout import tool_container, key_concept, research_question, upload_container
+import sys
+
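+# Make process_file importable. NOTE(review): from modules/content/ this appends modules/ (not the repo root) to sys.path - confirm process_file.py is reachable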
+sys.path.append(str(Path(__file__).parent.parent))
+try:
+ from process_file import process_file as process_file_util
+ process_file = process_file_util
+except ImportError:
+ # Fallback if process_file is not available
+ def process_file(uploaded_file, use_vision=True, custom_prompt=None):
+ """Fallback function for processing files"""
+ st.warning("Using mock processing function. Real OCR functionality is not available.")
+ return {
+ "file_name": uploaded_file.name,
+ "languages": ["English"],
+ "topics": ["History", "Document"],
+ "ocr_contents": {
+ "content": f"This is mock OCR content for {uploaded_file.name}. Vision model: {use_vision}"
+ }
+ }
+
+def render():
+ """Module 5: Interactive OCR Experiment"""
+
+ st.title("Module 5: Interactive OCR Experiment")
+
+ # Introduction to the interactive experiment
+ intro_content = """
+
Interactive OCR Experiment
+
+ This interactive experiment allows you to process historical documents with OCR and analyze the results.
+ Try different settings and compare the outcomes to understand the strengths and limitations of OCR technology.
+
+ """
+ st.markdown(intro_content, unsafe_allow_html=True)
+
+ # Create tabs for different activities
+ experiment_tab, compare_tab, analyze_tab = st.tabs(["Process Documents", "Compare Results", "Analysis Guide"])
+
+ # Try to import PDF tools if available
+ try:
+ from pdf2image import convert_from_bytes
+ pdf_support = True
+ except ImportError:
+ pdf_support = False
+ st.warning("PDF preview functionality is limited. The pdf2image module is required for PDF previews.")
+
+ with experiment_tab:
+ # Create a two-column layout
+ col1, col2 = st.columns([1, 1])
+
+ with col1:
+ # Tool container for document selection and options
+ st.subheader("Step 1: Select Document & Options")
+
+ # Processing options
+ use_vision = st.checkbox("Use Vision Model", value=True,
+ help="Use the vision model for improved analysis")
+
+ # Additional prompt
+ st.markdown("### Custom Research Prompt (Optional)")
+ st.markdown("""Provide additional instructions to guide the OCR analysis.
+ Focus on specific aspects of historical research you're interested in.""")
+ custom_prompt = st.text_area("Research Prompt",
+ placeholder="E.g., Focus on identifying dates and historical figures...",
+ help="Optional instructions to guide the analysis")
+
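+ # Sample document selection. NOTE(review): from modules/content/, parent.parent / "input" is modules/input, not the repo-root input/ - confirm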
+ input_dir = Path(__file__).parent.parent / "input"
+
+ if input_dir.exists():
+ sample_files = list(input_dir.glob("*.jpg")) + list(input_dir.glob("*.png")) + list(input_dir.glob("*.pdf"))
+
+ if sample_files:
+ st.markdown("#### Sample Documents")
+ sample_options = ["Upload my own document"] + [f.name for f in sample_files]
+ sample_choice = st.selectbox("Choose a document:", sample_options)
+
+ if sample_choice != "Upload my own document":
+ # Look up the selected sample file by name
+ selected_file = next((f for f in sample_files if f.name == sample_choice), None)
+
+ if selected_file:
+ # Store the selected sample file in session state
+ with open(selected_file, "rb") as f:
+ file_bytes = f.read()
+
+ st.session_state.sample_file = {
+ "name": selected_file.name,
+ "bytes": file_bytes
+ }
+
+ # Preview the selected sample
+ if selected_file.suffix.lower() == ".pdf" and pdf_support:
+ try:
+ with st.spinner("Generating PDF preview..."):
+ images = convert_from_bytes(file_bytes, first_page=1, last_page=1, dpi=150)
+ if images:
+ st.image(images[0], caption=f"Preview: {selected_file.name}", use_column_width=True)
+ except Exception:
+ st.info(f"PDF selected: {selected_file.name}")
+ else:
+ # For images display directly
+ try:
+ from PIL import Image
+ img = Image.open(io.BytesIO(file_bytes))
+ st.image(img, caption=f"Preview: {selected_file.name}", use_column_width=True)
+ except Exception:
+ st.info(f"Selected: {selected_file.name}")
+ else:
+ # Clear the sample file if "Upload my own" is selected
+ if 'sample_file' in st.session_state:
+ del st.session_state.sample_file
+
+ # Display file uploader
+ upload_html = """
+
Upload a document to get started
+
Supported formats: PDF, JPG, PNG
+ """
+
+ upload_container(upload_html)
+ uploaded_file = st.file_uploader("Choose a file", type=["pdf", "png", "jpg", "jpeg"], label_visibility="collapsed")
+
+ if uploaded_file is not None:
+ # Display preview of the uploaded file
+ file_ext = Path(uploaded_file.name).suffix.lower()
+
+ if file_ext == ".pdf" and pdf_support:
+ try:
+ # Convert first page of PDF to image for preview
+ pdf_bytes = uploaded_file.getvalue()
+ with st.spinner("Generating PDF preview..."):
+ images = convert_from_bytes(pdf_bytes, first_page=1, last_page=1, dpi=150)
+ if images:
+ st.image(images[0], caption=f"PDF Preview: {uploaded_file.name}", use_column_width=True)
+ else:
+ st.info(f"PDF uploaded: {uploaded_file.name}")
+ except Exception:
+ st.info(f"PDF uploaded: {uploaded_file.name}")
+ elif file_ext != ".pdf":
+ st.image(uploaded_file, use_column_width=True)
+ else:
+ st.info(f"PDF uploaded: {uploaded_file.name}")
+ else:
+ # No sample files, just show the uploader
+ upload_html = """
+
Upload a document to get started
+
Supported formats: PDF, JPG, PNG
+ """
+
+ upload_container(upload_html)
+ uploaded_file = st.file_uploader("Choose a file", type=["pdf", "png", "jpg", "jpeg"], label_visibility="collapsed")
+
+ if uploaded_file is not None:
+ # Display the file preview
+ file_ext = Path(uploaded_file.name).suffix.lower()
+ if file_ext == ".pdf" and pdf_support:
+ try:
+ pdf_bytes = uploaded_file.getvalue()
+ with st.spinner("Generating PDF preview..."):
+ images = convert_from_bytes(pdf_bytes, first_page=1, last_page=1, dpi=150)
+ if images:
+ st.image(images[0], caption=f"PDF Preview: {uploaded_file.name}", use_column_width=True)
+ except Exception:
+ st.info(f"PDF uploaded: {uploaded_file.name}")
+ elif file_ext != ".pdf":
+ st.image(uploaded_file, use_column_width=True)
+ else:
+ st.info(f"PDF uploaded: {uploaded_file.name}")
+ else:
+ # No input directory
+ upload_html = """
+
Upload a document to get started
+
Supported formats: PDF, JPG, PNG
+ """
+
+ upload_container(upload_html)
+ uploaded_file = st.file_uploader("Choose a file", type=["pdf", "png", "jpg", "jpeg"], label_visibility="collapsed")
+
+ # Step 2: processing controls
+ st.subheader("Step 2: Process the Document")
+
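+ # Pick the file to process (sample or upload). NOTE(review): sample_choice is only bound when sample files were found above - confirm it cannot be unbound here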
+ file_to_process = None
+ if 'sample_file' in st.session_state and sample_choice != "Upload my own document":
+ # Wrap the sample bytes in an object exposing the same name/getvalue() interface as an uploaded file
+ class SampleFileObject:
+ def __init__(self, name, data):
+ self.name = name
+ self._data = data
+
+ def getvalue(self):
+ return self._data
+
+ file_to_process = SampleFileObject(
+ st.session_state.sample_file["name"],
+ st.session_state.sample_file["bytes"]
+ )
+ elif 'uploaded_file' in locals() and uploaded_file is not None:
+ file_to_process = uploaded_file
+
+ # Process button
+ process_button = st.button(
+ "Process Document",
+ disabled=file_to_process is None,
+ use_container_width=True
+ )
+
+ if process_button and file_to_process is not None:
+ with st.spinner("Processing document..."):
+ try:
+ # Process the file
+ result = process_file(file_to_process, use_vision, custom_prompt=custom_prompt if custom_prompt else None)
+
+ if result:
+ st.success("Document processed successfully!")
+
+ # Store result in session state for display in the right column
+ st.session_state.current_result = result
+
+ # Add to processing history
+ history_item = {
+ "id": datetime.now().timestamp(),
+ "fileName": file_to_process.name,
+ "timestamp": datetime.now().isoformat(),
+ "result": result,
+ "useVision": use_vision
+ }
+
+ if 'processing_history' not in st.session_state:
+ st.session_state.processing_history = []
+
+ st.session_state.processing_history.append(history_item)
+
+ st.experimental_rerun()
+ else:
+ st.error("Failed to process document.")
+ except Exception as e:
+ st.error(f"Error processing document: {str(e)}")
+
+ # Experiment instructions
+ experiment_content = """
+
Experiment Instructions
+
+
Step 1: Select a document and choose your options
+
Step 2: Process the document with the selected options
+
Step 3: Analyze the results in the panel on the right
+
Step 4: Try again with different settings (e.g., toggle vision model)
+
Step 5: Compare results between different runs
+
+ """
+ key_concept(experiment_content)
+
+ with col2:
+ # Results display
+ st.subheader("Step 3: View Results")
+
+ if 'current_result' in st.session_state and st.session_state.current_result:
+ result = st.session_state.current_result
+
+ # Display results in a tool container
+ result_html = f"""
+
Vision model used: {'Yes' if latest['useVision'] else 'No'}
+ """
+ tool_container(latest_html)
+
+ # History in expander
+ with st.expander("View Complete Processing History"):
+ for i, item in enumerate(reversed(st.session_state.processing_history)):
+ st.markdown(f"""
+
You're comparing the same document processed with different models.
+ This is an excellent way to evaluate the impact of vision capabilities on OCR accuracy.
Process at least two documents to enable side-by-side comparison. Try processing
+ the same document with and without the vision model to see the differences in OCR quality.
+ """
+ research_question(need_more_content)
+
+ # Analysis guide tab
+ with analyze_tab:
+ st.subheader("Analysis Guide")
+
+ st.markdown("""
+ ### How to Analyze OCR Results
+
+ When analyzing OCR results from historical documents, consider these key factors:
+
+ 1. **Text Accuracy**
+ - Check for common OCR errors (e.g., mistaking "e" for "c", "l" for "1")
+ - Assess recognition of period-specific typography and writing styles
+ - Evaluate handling of degraded or damaged text areas
+
+ 2. **Structure Preservation**
+ - Does the OCR maintain paragraph and section breaks?
+ - Are columns and tabular data correctly preserved?
+ - How well are page transitions handled?
+
+ 3. **Special Elements**
+ - Recognition of footnotes, marginalia, and annotations
+ - Handling of illustrations, diagrams, and decorative elements
+ - Treatment of watermarks, signatures, and stamps
+
+ 4. **Metadata Extraction**
+ - Accuracy of detected languages, topics, and document type
+ - Identification of dates, names, and key entities
+ - Recognition of document purpose and context
+ """)
+
+ col1, col2 = st.columns(2)
+
+ with col1:
+ challenge_content = """
+
Common OCR Challenges
+
+
Typography Variations: Historical fonts that differ from modern text
+
Material Degradation: Fading, stains, tears affecting legibility
+
Handwritten Elements: Marginalia, signatures, and annotations
+
Complex Layouts: Multi-column formats and decorative elements
+
Language and Terminology: Archaic terms and multilingual content
Contextual Reading: Use context to interpret unclear passages
+
Error Patterns: Identify and correct systematic OCR errors
+
Hybrid Analysis: Combine OCR search with close reading
+
Comparative Processing: Try different settings on documents
+
Iterative Refinement: Use insights to improve future processing
+
+ """
+ gray_container(tips_content)
+
+ # Show example analysis if there's processing history
+ if 'processing_history' in st.session_state and st.session_state.processing_history:
+ with st.expander("Example Analysis from Your Documents"):
+ # Pick the latest document
+ latest = st.session_state.processing_history[-1]
+
+ st.markdown(f"""
+ #### Sample Analysis for: {latest['fileName']}
+
+ **Document Context:**
+ - Languages: {', '.join(latest['result'].get('languages', ['Unknown']))}
+ - Topics: {', '.join(latest['result'].get('topics', ['Unknown']))}
+ - Vision model used: {'Yes' if latest['useVision'] else 'No'}
+
+ **What to Look For:**
+ 1. Check how well the model identified key topics and languages
+ 2. Evaluate the completeness of extracted text
+ 3. Note any systematic errors in text recognition
+ 4. Assess how well document structure was preserved
+ """)
\ No newline at end of file
diff --git a/modules/content/module6.py b/modules/content/module6.py
new file mode 100644
index 0000000000000000000000000000000000000000..5efa8ee87b1f737172445d3f7caa20dd79c588b4
--- /dev/null
+++ b/modules/content/module6.py
@@ -0,0 +1,154 @@
+import streamlit as st
+from layout import gray_container, key_concept, quote, tool_container
+from datetime import datetime
+
+def render():
+ """Module 6: Conclusion and Future Directions"""
+
+ st.title("Module 6: Conclusion and Future Directions")
+
+ col1, col2 = st.columns([3, 2])
+
+ with col1:
+ summary_content = """
+
Workshop Summary
+
Throughout this workshop, we've explored:
+
+
Text-Image Interdependence: The complex relationship between textual and visual elements
+
OCR Technology: The evolution of OCR and its application to historical materials
+
Methodological Approaches: Hybrid strategies for working with historical texts
+
Practical Application: Hands-on experience with OCR processing tools
+ """
+ gray_container(research_content)
+
+ # Inspiring quote
+ quote_content = "The digital humanities are not about building, they're about sharing. The digital humanities are not about the digital at all. They're all about innovation and disruption. The digital humanities are really an insurgent humanities."
+ quote(quote_content, "Matthew Kirschenbaum, Professor of Digital Humanities")
+
+ # Additional resources
+ resources_content = """
+
This workshop was designed as an educational resource for historians, archivists, and digital humanities scholars.
+
It demonstrates the integration of modern AI vision-language models with historical research methodologies.
+
Special thanks to the digital humanities community for continued innovation in computational approaches to historical research.
+ """
+ st.markdown(acknowledgment_content, unsafe_allow_html=True)
+
+ # Restart the workshop button
+ if st.button("Start Workshop Again", use_container_width=True):
+ # Return to module 1 (only if current_module is already in session state)
+ if 'current_module' in st.session_state:
+ st.session_state.current_module = 1
+
+ # Do not reset the processing history
+
+ st.experimental_rerun()
\ No newline at end of file
diff --git a/modules/educational_module.py b/modules/educational_module.py
new file mode 100644
index 0000000000000000000000000000000000000000..6bc50f5e03de8e234a6c3369da6b68ce4a0faee0
--- /dev/null
+++ b/modules/educational_module.py
@@ -0,0 +1,547 @@
+import streamlit as st
+import io
+import tempfile
+from pathlib import Path
+from datetime import datetime
+from layout import tool_container, key_concept, research_question, upload_container
+import sys
+
+# Make the directory above this module's folder importable so the project's
+# process_file module can be located, then bind its entry point locally.
+sys.path.append(str(Path(__file__).parent.parent))
+try:
+    from process_file import process_file as process_file_util
+    process_file = process_file_util
+except ImportError:
+    # The real processor could not be imported: define a stand-in with the
+    # same call signature so the rest of this module keeps working.
+    def process_file(uploaded_file, use_vision=True, custom_prompt=None):
+        """Return canned OCR output in place of the real processor.
+
+        Shows a Streamlit warning, then returns a dict with the keys
+        ``file_name``, ``languages``, ``topics`` and ``ocr_contents``.
+        ``custom_prompt`` is accepted but not used by this stand-in.
+        """
+        st.warning("Using mock processing function. Real OCR functionality is not available.")
+        mock_text = f"This is mock OCR content for {uploaded_file.name}. Vision model: {use_vision}"
+        mock_result = {
+            "file_name": uploaded_file.name,
+            "languages": ["English"],
+            "topics": ["History", "Document"],
+            "ocr_contents": {"content": mock_text},
+        }
+        return mock_result
+
+def render():
+ """Module 5: Interactive OCR Experiment"""
+
+ st.title("Module 5: Interactive OCR Experiment")
+
+ # Introduction to the interactive experiment
+ intro_content = """
+
Interactive OCR Experiment
+
+ This interactive experiment allows you to process historical documents with OCR and analyze the results.
+ Try different settings and compare the outcomes to understand the strengths and limitations of OCR technology.
+
+ """
+ st.markdown(intro_content, unsafe_allow_html=True)
+
+ # Create tabs for different activities
+ experiment_tab, compare_tab, analyze_tab = st.tabs(["Process Documents", "Compare Results", "Analysis Guide"])
+
+ # Try to import PDF tools if available
+ try:
+ from pdf2image import convert_from_bytes
+ pdf_support = True
+ except ImportError:
+ pdf_support = False
+ st.warning("PDF preview functionality is limited. The pdf2image module is required for PDF previews.")
+
+ with experiment_tab:
+ # Create a two-column layout
+ col1, col2 = st.columns([1, 1])
+
+ with col1:
+ # Tool container for document selection and options
+ st.subheader("Step 1: Select Document & Options")
+
+ # Processing options
+ use_vision = st.checkbox("Use Vision Model", value=True,
+ help="Use the vision model for improved analysis")
+
+ # Additional prompt
+ st.markdown("### Custom Research Prompt (Optional)")
+ st.markdown("""Provide additional instructions to guide the OCR analysis.
+ Focus on specific aspects of historical research you're interested in.""")
+ custom_prompt = st.text_area("Research Prompt",
+ placeholder="E.g., Focus on identifying dates and historical figures...",
+ help="Optional instructions to guide the analysis")
+
+ # Sample document selection
+ input_dir = Path(__file__).parent.parent / "input"
+
+ if input_dir.exists():
+ sample_files = list(input_dir.glob("*.jpg")) + list(input_dir.glob("*.png")) + list(input_dir.glob("*.pdf"))
+
+ if sample_files:
+ st.markdown("#### Sample Documents")
+ sample_options = ["Upload my own document"] + [f.name for f in sample_files]
+ sample_choice = st.selectbox("Choose a document:", sample_options)
+
+ if sample_choice != "Upload my own document":
+ # Process the selected sample file
+ selected_file = next((f for f in sample_files if f.name == sample_choice), None)
+
+ if selected_file:
+ # Store the selected sample file in session state
+ with open(selected_file, "rb") as f:
+ file_bytes = f.read()
+
+ st.session_state.sample_file = {
+ "name": selected_file.name,
+ "bytes": file_bytes
+ }
+
+ # Preview the selected sample
+ if selected_file.suffix.lower() == ".pdf" and pdf_support:
+ try:
+ with st.spinner("Generating PDF preview..."):
+ images = convert_from_bytes(file_bytes, first_page=1, last_page=1, dpi=150)
+ if images:
+ st.image(images[0], caption=f"Preview: {selected_file.name}", use_column_width=True)
+ except Exception:
+ st.info(f"PDF selected: {selected_file.name}")
+ else:
+ # For images display directly
+ try:
+ from PIL import Image
+ img = Image.open(io.BytesIO(file_bytes))
+ st.image(img, caption=f"Preview: {selected_file.name}", use_column_width=True)
+ except Exception:
+ st.info(f"Selected: {selected_file.name}")
+ else:
+ # Clear the sample file if "Upload my own" is selected
+ if 'sample_file' in st.session_state:
+ del st.session_state.sample_file
+
+ # Display file uploader
+ upload_html = """
+
Upload a document to get started
+
Supported formats: PDF, JPG, PNG
+ """
+
+ upload_container(upload_html)
+ uploaded_file = st.file_uploader("Choose a file", type=["pdf", "png", "jpg", "jpeg"], label_visibility="collapsed")
+
+ if uploaded_file is not None:
+ # Display preview of the uploaded file
+ file_ext = Path(uploaded_file.name).suffix.lower()
+
+ if file_ext == ".pdf" and pdf_support:
+ try:
+ # Convert first page of PDF to image for preview
+ pdf_bytes = uploaded_file.getvalue()
+ with st.spinner("Generating PDF preview..."):
+ images = convert_from_bytes(pdf_bytes, first_page=1, last_page=1, dpi=150)
+ if images:
+ st.image(images[0], caption=f"PDF Preview: {uploaded_file.name}", use_column_width=True)
+ else:
+ st.info(f"PDF uploaded: {uploaded_file.name}")
+ except Exception:
+ st.info(f"PDF uploaded: {uploaded_file.name}")
+ elif file_ext != ".pdf":
+ st.image(uploaded_file, use_column_width=True)
+ else:
+ st.info(f"PDF uploaded: {uploaded_file.name}")
+ else:
+ # No sample files, just show the uploader
+ upload_html = """
+
Upload a document to get started
+
Supported formats: PDF, JPG, PNG
+ """
+
+ upload_container(upload_html)
+ uploaded_file = st.file_uploader("Choose a file", type=["pdf", "png", "jpg", "jpeg"], label_visibility="collapsed")
+
+ if uploaded_file is not None:
+ # Display the file preview
+ file_ext = Path(uploaded_file.name).suffix.lower()
+ if file_ext == ".pdf" and pdf_support:
+ try:
+ pdf_bytes = uploaded_file.getvalue()
+ with st.spinner("Generating PDF preview..."):
+ images = convert_from_bytes(pdf_bytes, first_page=1, last_page=1, dpi=150)
+ if images:
+ st.image(images[0], caption=f"PDF Preview: {uploaded_file.name}", use_column_width=True)
+ except Exception:
+ st.info(f"PDF uploaded: {uploaded_file.name}")
+ elif file_ext != ".pdf":
+ st.image(uploaded_file, use_column_width=True)
+ else:
+ st.info(f"PDF uploaded: {uploaded_file.name}")
+ else:
+ # No input directory
+ upload_html = """
+
Upload a document to get started
+
Supported formats: PDF, JPG, PNG
+ """
+
+ upload_container(upload_html)
+ uploaded_file = st.file_uploader("Choose a file", type=["pdf", "png", "jpg", "jpeg"], label_visibility="collapsed")
+
+ # Process button
+ st.subheader("Step 2: Process the Document")
+
+ # Get the file to process (either uploaded or sample)
+ file_to_process = None
+ if 'sample_file' in st.session_state and sample_choice != "Upload my own document":
+ # Create a FileUploader-like object from the sample file
+ class SampleFileObject:
+     """Small stand-in for an uploaded file, built from a bundled sample.
+
+     Carries a ``name`` attribute and a ``getvalue()`` method so a sample
+     read from disk can be handed to ``process_file`` like an upload.
+     """
+
+     def __init__(self, name, data):
+         # name: file name of the sample; data: its raw bytes
+         self.name = name
+         self._data = data
+
+     def getvalue(self):
+         """Return the bytes supplied at construction."""
+         return self._data
+
+ file_to_process = SampleFileObject(
+ st.session_state.sample_file["name"],
+ st.session_state.sample_file["bytes"]
+ )
+ elif 'uploaded_file' in locals() and uploaded_file is not None:
+ file_to_process = uploaded_file
+
+ # Process button
+ process_button = st.button(
+ "Process Document",
+ disabled=file_to_process is None,
+ use_container_width=True
+ )
+
+ if process_button and file_to_process is not None:
+     # Runs the selected document through process_file, stores the result
+     # for the results column, records it in the processing history and
+     # triggers a rerun so the new result is displayed.
+     with st.spinner("Processing document..."):
+         try:
+             # Process the file; an empty prompt is passed on as None
+             result = process_file(file_to_process, use_vision, custom_prompt=custom_prompt or None)
+
+             if result:
+                 st.success("Document processed successfully!")
+
+                 # Store result in session state for display in the right column
+                 st.session_state.current_result = result
+
+                 # Take one clock reading so "id" and "timestamp" describe
+                 # the same instant (previously two separate now() calls).
+                 processed_at = datetime.now()
+
+                 # Add to processing history
+                 history_item = {
+                     "id": processed_at.timestamp(),
+                     "fileName": file_to_process.name,
+                     "timestamp": processed_at.isoformat(),
+                     "result": result,
+                     "useVision": use_vision
+                 }
+
+                 if 'processing_history' not in st.session_state:
+                     st.session_state.processing_history = []
+
+                 st.session_state.processing_history.append(history_item)
+
+                 # Prefer st.rerun(), which the rest of the app already uses;
+                 # st.experimental_rerun() is only looked up when st.rerun is
+                 # missing, i.e. on older Streamlit releases.
+                 rerun = getattr(st, "rerun", None) or st.experimental_rerun
+                 rerun()
+             else:
+                 st.error("Failed to process document.")
+         except Exception as e:
+             # Surface any processing failure in the UI instead of crashing
+             st.error(f"Error processing document: {str(e)}")
+
+ # Experiment instructions
+ experiment_content = """
+
Experiment Instructions
+
+
Step 1: Select a document and choose your options
+
Step 2: Process the document with the selected options
+
Step 3: Analyze the results in the panel on the right
+
Step 4: Try again with different settings (e.g., toggle vision model)
+
Step 5: Compare results between different runs
+
+ """
+ key_concept(experiment_content)
+
+ with col2:
+ # Results display
+ st.subheader("Step 3: View Results")
+
+ if 'current_result' in st.session_state and st.session_state.current_result:
+ result = st.session_state.current_result
+
+ # Display results in a tool container
+ result_html = f"""
+
Vision model used: {'Yes' if latest['useVision'] else 'No'}
+ """
+ tool_container(latest_html)
+
+ # History in expander
+ with st.expander("View Complete Processing History"):
+ for i, item in enumerate(reversed(st.session_state.processing_history)):
+ st.markdown(f"""
+
You're comparing the same document processed with different models.
+ This is an excellent way to evaluate the impact of vision capabilities on OCR accuracy.
Process at least two documents to enable side-by-side comparison. Try processing
+ the same document with and without the vision model to see the differences in OCR quality.
+ """
+ research_question(need_more_content)
+
+ # Analysis guide tab
+ with analyze_tab:
+ st.subheader("Analysis Guide")
+
+ st.markdown("""
+ ### How to Analyze OCR Results
+
+ When analyzing OCR results from historical documents, consider these key factors:
+
+ 1. **Text Accuracy**
+ - Check for common OCR errors (e.g., mistaking "e" for "c", "l" for "1")
+ - Assess recognition of period-specific typography and writing styles
+ - Evaluate handling of degraded or damaged text areas
+
+ 2. **Structure Preservation**
+ - Does the OCR maintain paragraph and section breaks?
+ - Are columns and tabular data correctly preserved?
+ - How well are page transitions handled?
+
+ 3. **Special Elements**
+ - Recognition of footnotes, marginalia, and annotations
+ - Handling of illustrations, diagrams, and decorative elements
+ - Treatment of watermarks, signatures, and stamps
+
+ 4. **Metadata Extraction**
+ - Accuracy of detected languages, topics, and document type
+ - Identification of dates, names, and key entities
+ - Recognition of document purpose and context
+ """)
+
+ col1, col2 = st.columns(2)
+
+ with col1:
+ challenge_content = """
+
Common OCR Challenges
+
+
Typography Variations: Historical fonts that differ from modern text
+
Material Degradation: Fading, stains, tears affecting legibility
+
Handwritten Elements: Marginalia, signatures, and annotations
+
Complex Layouts: Multi-column formats and decorative elements
+
Language and Terminology: Archaic terms and multilingual content
Contextual Reading: Use context to interpret unclear passages
+
Error Patterns: Identify and correct systematic OCR errors
+
Hybrid Analysis: Combine OCR search with close reading
+
Comparative Processing: Try different settings on documents
+
Iterative Refinement: Use insights to improve future processing
+
+ """
+ gray_container(tips_content)
+
+ # Show example analysis if there's processing history
+ if 'processing_history' in st.session_state and st.session_state.processing_history:
+ with st.expander("Example Analysis from Your Documents"):
+ # Pick the latest document
+ latest = st.session_state.processing_history[-1]
+
+ st.markdown(f"""
+ #### Sample Analysis for: {latest['fileName']}
+
+ **Document Context:**
+ - Languages: {', '.join(latest['result'].get('languages', ['Unknown']))}
+ - Topics: {', '.join(latest['result'].get('topics', ['Unknown']))}
+ - Vision model used: {'Yes' if latest['useVision'] else 'No'}
+
+ **What to Look For:**
+ 1. Check how well the model identified key topics and languages
+ 2. Evaluate the completeness of extracted text
+ 3. Note any systematic errors in text recognition
+ 4. Assess how well document structure was preserved
+ """)
\ No newline at end of file
diff --git a/modules/modular_app.py b/modules/modular_app.py
new file mode 100644
index 0000000000000000000000000000000000000000..b18cbb21b315e8cb8e42af4cb433afeceed2ea97
--- /dev/null
+++ b/modules/modular_app.py
@@ -0,0 +1,276 @@
+import streamlit as st
+from pathlib import Path
+import sys
+from layout import page_wrapper
+from modules import get_module, get_module_name, module_names
+
+# Configure the Streamlit page: wide layout with the sidebar collapsed on load
+st.set_page_config(
+    page_title="Historical OCR Workshop",
+    page_icon="📜",
+    layout="wide",
+    initial_sidebar_state="collapsed"
+)
+
+# Seed the session-state entries used for workshop navigation and history.
+# Entries that already exist (from an earlier rerun) are left untouched.
+for _state_key, _initial_value in (
+    ('current_module', 1),
+    ('workshop_started', False),
+    ('processing_history', []),
+):
+    if _state_key not in st.session_state:
+        st.session_state[_state_key] = _initial_value
+
+def navigate_to_module(module_number):
+    """Switch the workshop to ``module_number`` and rerun the script.
+
+    Stores ``module_number`` in ``st.session_state.current_module`` and then
+    calls ``st.rerun()`` so the page is redrawn for the selected module.
+    """
+    st.session_state.current_module = module_number
+    st.rerun()
+
+# Welcome screen if workshop hasn't been started
+if not st.session_state.workshop_started:
+ def welcome_screen():
+ """Renders the welcome/start screen"""
+ # Hero section with eye-catching design
+ st.markdown("""
+
+
Historical OCR Workshop
+
Unlock the potential of historical documents with modern OCR technology
+
+ This interactive workshop explores the application of OCR technology to historical documents,
+ combining theoretical understanding with practical experiences. Designed for historians,
+ archivists, and digital humanities scholars, it offers both conceptual frameworks and hands-on skills.
+
+ Optical Character Recognition (OCR) technology enables computers to extract text from images and documents.
+ Modern OCR uses AI vision models to understand both the text and its visual context, making it powerful for
+ historical research and digital humanities.
+
+ """, unsafe_allow_html=True)
+
+ with col2:
+ # Add an engaging research question
+ st.markdown("""
+
+
For Historians:
+ How might OCR technology transform our access to and interpretation of historical documents?
+ What new research questions become possible when large archives become machine-readable?
+
+ """, unsafe_allow_html=True)
+
+ # Display a sample historical document image
+ input_dir = Path(__file__).parent / "input"
+ sample_path = input_dir / "magellan-travels.jpg"
+ if sample_path.exists():
+ try:
+ from PIL import Image
+ with Image.open(sample_path) as img:
+ st.image(img, caption="Sample Historical Document", width=300)
+ except Exception:
+ pass
+
+ # What you'll learn section with visual learning outcomes
+ st.markdown('
What You\'ll Learn
', unsafe_allow_html=True)
+
+ # Create three columns for clean layout
+ col1, col2, col3 = st.columns(3)
+
+ with col1:
+ st.markdown("""
+
+
Conceptual Understanding
+
+ - Text-image relationships in historical documents
+ - Evolution of OCR technology
+ - AI vision models for document analysis
+ - Historical typography challenges
+
+ """, unsafe_allow_html=True)
+
+ with col2:
+ st.markdown("""
+
+
Methodological Approaches
+
+ - Critical frameworks for OCR in historical research
+ - Hybrid computational-traditional methods
+ - Error analysis and interpretation
+ - Contextual reading strategies
+
+ """, unsafe_allow_html=True)
+
+ with col3:
+ st.markdown("""
+
+
Practical Skills
+
+ - Processing historical documents with OCR
+ - Analyzing and structuring extracted information
+ - Integrating OCR into research workflows
+ - Building searchable archives
+
+ "The digital turn in historical research is not just about converting analog to digital;
+ it's about transforming how we access, analyze, and interpret the past."
+
+ — Dr. Jane Winters, Professor of Digital Humanities
+
", unsafe_allow_html=True)
+
+ for i, name in enumerate(module_names, 1):
+ btn_style = "primary" if i == current_module else "secondary"
+ if st.button(f"{i}: {name}", key=f"nav_module_{i}", type=btn_style, use_container_width=True):
+ st.session_state.current_module = i
+ st.rerun()
+
+ # About the workshop in a collapsible section
+ with st.expander("About the Workshop"):
+ st.markdown("""
+ This interactive workshop explores OCR technology for historical documents.
+
+ **How to use this workshop:**
+ 1. Navigate through modules sequentially
+ 2. Expand content sections to read more
+ 3. Try the interactive OCR experiment
+ 4. Reflect on research questions
+
+ For help or more information, use the reference materials in Module 6.
+ """)
+
+ # Processing history if available
+ if st.session_state.processing_history:
+ with st.expander("Your Activity"):
+ st.markdown(f"Documents processed: {len(st.session_state.processing_history)}", unsafe_allow_html=True)
+
+ # Show the most recent document processed
+ latest = st.session_state.processing_history[-1]
+ st.markdown(f"""
+
+ Latest document: {latest['fileName']}
+ Processed with {' vision model' if latest['useVision'] else ' basic OCR'}
+
+ """, unsafe_allow_html=True)
+
+ # Render the current module content using the page wrapper
+ page_wrapper(module.render, current_module)
+
+# At the bottom of the page, create the navigation buttons that the fixed
+# navigation bar is meant to trigger.
+# NOTE: st.button() accepts no `label_visibility` argument (that option
+# belongs to input widgets such as st.file_uploader), so passing it raised
+# TypeError on every run once the workshop had started. Any hiding of these
+# buttons has to come from the page styling instead.
+if st.session_state.workshop_started:
+    # Previous navigation button (activated by the fixed nav)
+    if st.session_state.current_module > 1:
+        if st.button("←", key=f"nav_prev_{st.session_state.current_module-1}"):
+            navigate_to_module(st.session_state.current_module - 1)
+
+    # Next navigation button (activated by the fixed nav)
+    if st.session_state.current_module < 6:
+        if st.button("→", key=f"nav_next_{st.session_state.current_module+1}"):
+            navigate_to_module(st.session_state.current_module + 1)
+
+    # Module navigation dots (activated by the fixed nav), one per module 1..6
+    for i in range(1, 7):
+        if st.button(f"{i}", key=f"nav_dot_{i}"):
+            navigate_to_module(i)
\ No newline at end of file
diff --git a/ocr_processing.py b/ocr_processing.py
deleted file mode 100644
index 50faef5d589e5d3400177fc41bc3295248effbad..0000000000000000000000000000000000000000
--- a/ocr_processing.py
+++ /dev/null
@@ -1,665 +0,0 @@
-# Standard library imports
-import os
-import hashlib
-import tempfile
-import logging
-import time
-from datetime import datetime
-from pathlib import Path
-
-# Configure logging
-logging.basicConfig(level=logging.INFO,
- format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
-logger = logging.getLogger(__name__)
-
-# Third-party imports
-import streamlit as st
-
-# Local application imports
-from structured_ocr import StructuredOCR
-# Import from updated utils directory
-from utils.image_utils import clean_ocr_result
-# Temporarily retain old utils imports until they are fully migrated
-from utils import generate_cache_key, timing, format_timestamp, create_descriptive_filename, extract_subject_tags
-import preprocessing
-from error_handler import handle_ocr_error, check_file_size
-from image_segmentation import segment_image_for_ocr, process_segmented_image
-
-@st.cache_data(ttl=24*3600, max_entries=20, show_spinner=False)
-def process_file_cached(file_path, file_type, use_vision, file_size_mb, cache_key, preprocessing_options_hash=None, custom_prompt=None):
- """
- Cached version of OCR processing to reuse results
-
- Args:
- file_path: Path to the file to process
- file_type: Type of file (pdf or image)
- use_vision: Whether to use vision model
- file_size_mb: File size in MB
- cache_key: Cache key for the file
- preprocessing_options_hash: Hash of preprocessing options
- custom_prompt: Custom prompt to use for OCR
-
- Returns:
- dict: OCR result
- """
- # Initialize OCR processor
- processor = StructuredOCR()
-
- # Process the file
- with timing(f"OCR processing of {file_type} file"):
- result = processor.process_file(
- file_path,
- file_type=file_type,
- use_vision=use_vision,
- file_size_mb=file_size_mb,
- custom_prompt=custom_prompt
- )
-
- return result
-
-def process_file(uploaded_file, use_vision=True, preprocessing_options=None, progress_reporter=None,
- pdf_dpi=150, max_pages=3, pdf_rotation=0, custom_prompt=None, perf_mode="Quality",
- use_segmentation=False):
- """
- Process the uploaded file and return the OCR results
-
- Args:
- uploaded_file: The uploaded file to process
- use_vision: Whether to use vision model
- preprocessing_options: Dictionary of preprocessing options
- progress_reporter: ProgressReporter instance for UI updates
- pdf_dpi: DPI for PDF conversion
- max_pages: Maximum number of pages to process
- pdf_rotation: PDF rotation value
- custom_prompt: Custom prompt for OCR
- perf_mode: Performance mode (Quality or Speed)
-
- Returns:
- dict: OCR result
- """
- if preprocessing_options is None:
- preprocessing_options = {}
-
- # Create a container for progress indicators if not provided
- if progress_reporter is None:
- from ui.ui_components import ProgressReporter
- progress_reporter = ProgressReporter(st.empty()).setup()
-
- # Initialize temporary file paths list
- temp_file_paths = []
-
- # Also track temporary files in session state for reliable cleanup
- if 'temp_file_paths' not in st.session_state:
- st.session_state.temp_file_paths = []
-
- try:
- # Check if file size exceeds maximum allowed size
- is_valid, file_size_mb, error_message = check_file_size(uploaded_file.getvalue())
- if not is_valid:
- progress_reporter.complete(success=False)
- st.error(error_message)
- return {
- "file_name": uploaded_file.name,
- "topics": ["Document"],
- "languages": ["English"],
- "error": error_message,
- "ocr_contents": {
- "error": error_message,
- "partial_text": "Document could not be processed due to size limitations."
- }
- }
-
- # Update progress
- progress_reporter.update(10, "Initializing OCR processor...")
-
- # Determine file type from extension
- file_ext = Path(uploaded_file.name).suffix.lower()
- file_type = "pdf" if file_ext == ".pdf" else "image"
- file_bytes = uploaded_file.getvalue()
-
- # For PDFs, we need to handle differently
- if file_type == "pdf":
- progress_reporter.update(20, "Preparing PDF document...")
-
- # Create a temporary file for processing
- temp_path = tempfile.NamedTemporaryFile(delete=False, suffix=file_ext).name
- with open(temp_path, 'wb') as f:
- f.write(file_bytes)
- temp_file_paths.append(temp_path)
-
- # Track temp files in session state for reliable cleanup
- if temp_path not in st.session_state.temp_file_paths:
- st.session_state.temp_file_paths.append(temp_path)
- logger.info(f"Added temp file to session state: {temp_path}")
-
- # Generate cache key
- cache_key = generate_cache_key(
- file_bytes,
- file_type,
- use_vision,
- preprocessing_options,
- pdf_rotation,
- custom_prompt
- )
-
- # Use the document type information from preprocessing options
- doc_type = preprocessing_options.get("document_type", "standard")
- modified_custom_prompt = custom_prompt
-
- # Enhance the prompt with document-type specific instructions
- # Check for letterhead/marginalia document types with specialized handling
- try:
- from utils.helpers.letterhead_handler import get_letterhead_prompt, is_likely_letterhead
- # Extract text density features if available
- features = None
- if 'text_density' in preprocessing_options:
- features = preprocessing_options['text_density']
-
- # Check if this looks like a letterhead document
- if is_likely_letterhead(temp_path, features):
- # Get specialized letterhead prompt
- letterhead_prompt = get_letterhead_prompt(temp_path, features)
- if letterhead_prompt:
- logger.info(f"Using specialized letterhead prompt for document")
- modified_custom_prompt = letterhead_prompt
- # Set document type for tracking
- preprocessing_options["document_type"] = "letterhead"
- doc_type = "letterhead"
- except ImportError:
- logger.debug("Letterhead handler not available")
-
- # Add document-type specific instructions based on preprocessing options
- if doc_type == "handwritten" and not modified_custom_prompt:
- modified_custom_prompt = "This is a handwritten document. Please carefully transcribe all handwritten text, preserving line breaks and original formatting."
- elif doc_type == "handwritten" and "handwritten" not in modified_custom_prompt.lower():
- modified_custom_prompt += " This is a handwritten document. Please carefully transcribe all handwritten text, preserving line breaks and original formatting."
- elif doc_type == "newspaper" and not modified_custom_prompt:
- modified_custom_prompt = "This is a newspaper or document with columns. Please extract all text content from each column, maintaining proper reading order."
- elif doc_type == "newspaper" and "column" not in modified_custom_prompt.lower() and "newspaper" not in modified_custom_prompt.lower():
- modified_custom_prompt += " This appears to be a newspaper or document with columns. Please extract all text content from each column."
- elif doc_type == "book" and not modified_custom_prompt:
- modified_custom_prompt = "This is a book page. Extract titles, headers, footnotes, and body text, preserving paragraph structure and formatting."
-
- # Update the cache key with the modified prompt
- if modified_custom_prompt != custom_prompt:
- cache_key = generate_cache_key(
- open(temp_path, 'rb').read(),
- file_type,
- use_vision,
- preprocessing_options,
- pdf_rotation,
- modified_custom_prompt
- )
-
- progress_reporter.update(30, "Processing PDF with enhanced OCR...")
-
- # Process with cached function if possible
- try:
- result = process_file_cached(temp_path, file_type, use_vision, file_size_mb, cache_key,
- str(preprocessing_options), modified_custom_prompt)
- progress_reporter.update(90, "Finalizing results...")
- except Exception as e:
- logger.warning(f"Cached processing failed: {str(e)}. Using direct processing.")
- progress_reporter.update(60, f"Processing error: {str(e)}. Using enhanced PDF processor...")
-
- # Import the enhanced PDF processor
- try:
- from utils.pdf_ocr import PDFOCR
-
- # Use our specialized PDF processor
- pdf_processor = PDFOCR()
-
- # Process with the enhanced PDF processor
- result = pdf_processor.process_pdf(
- pdf_path=temp_path,
- use_vision=use_vision,
- max_pages=max_pages,
- custom_prompt=modified_custom_prompt
- )
-
- logger.info("PDF successfully processed with enhanced PDF processor")
- progress_reporter.update(90, "Finalizing results...")
- except ImportError:
- logger.warning("Enhanced PDF processor not available. Falling back to standard processing.")
- progress_reporter.update(70, "Falling back to standard PDF processing...")
-
- # If enhanced processor is not available, fall back to direct StructuredOCR processing
- processor = StructuredOCR()
- result = processor.process_file(
- file_path=temp_path,
- file_type="pdf",
- use_vision=use_vision,
- custom_prompt=modified_custom_prompt,
- file_size_mb=file_size_mb,
- max_pages=max_pages
- )
- progress_reporter.update(90, "Finalizing results...")
- else:
- # For image files
- progress_reporter.update(20, "Preparing image for processing...")
-
- # Apply preprocessing if needed
- temp_path, preprocessing_applied = preprocessing.apply_preprocessing_to_file(
- file_bytes,
- file_ext,
- preprocessing_options,
- temp_file_paths
- )
-
- if preprocessing_applied:
- progress_reporter.update(30, "Applied image preprocessing...")
-
- # Apply image segmentation if requested
- # This is especially helpful for complex documents with mixed text and images
- if use_segmentation:
- progress_reporter.update(35, "Applying image segmentation to separate text and image regions...")
-
- try:
- # Perform image segmentation with content preservation if requested
- preserve_content = preprocessing_options.get("preserve_content", True)
- segmentation_results = segment_image_for_ocr(
- temp_path,
- vision_enabled=use_vision,
- preserve_content=preserve_content
- )
-
- if segmentation_results['combined_result'] is not None:
- # Save the segmented result to a new temporary file
- segmented_temp_path = tempfile.NamedTemporaryFile(delete=False, suffix='.jpg').name
- segmentation_results['combined_result'].save(segmented_temp_path)
- temp_file_paths.append(segmented_temp_path)
-
- # Check if we have individual region images to process separately
- if 'region_images' in segmentation_results and segmentation_results['region_images']:
- # Process each region separately for better results
- regions_count = len(segmentation_results['region_images'])
- logger.info(f"Processing {regions_count} text regions individually")
- progress_reporter.update(40, f"Processing {regions_count} text regions separately...")
-
- # Initialize StructuredOCR processor
- processor = StructuredOCR()
-
- # Store individual region results
- region_results = []
-
- # Process each region individually
- for idx, region_info in enumerate(segmentation_results['region_images']):
- # Save region image to temp file
- region_temp_path = tempfile.NamedTemporaryFile(delete=False, suffix='.jpg').name
- region_info['pil_image'].save(region_temp_path)
- temp_file_paths.append(region_temp_path)
-
- # Create region-specific prompt
- region_prompt = f"This is region {idx+1} of {regions_count} from a segmented document. Extract all visible text precisely, preserving line breaks and structure."
-
- # Process the region
- try:
- region_result = processor.process_file(
- file_path=region_temp_path,
- file_type="image",
- use_vision=use_vision,
- custom_prompt=region_prompt,
- file_size_mb=None
- )
-
- # Store result with region info
- if 'ocr_contents' in region_result and 'raw_text' in region_result['ocr_contents']:
- region_results.append({
- 'text': region_result['ocr_contents']['raw_text'],
- 'coordinates': region_info['coordinates'],
- 'order': region_info['order']
- })
- except Exception as region_err:
- logger.warning(f"Error processing region {idx+1}: {str(region_err)}")
-
- # Sort regions by their order for correct reading flow
- region_results.sort(key=lambda x: x['order'])
-
- # Import the text utilities for intelligent merging
- try:
- from utils.text_utils import merge_region_texts
- # Use intelligent merging to avoid duplication in overlapped regions
- combined_text = merge_region_texts(region_results)
- logger.info("Using intelligent text merging for overlapping regions")
- except ImportError:
- # Fallback to simple joining if import fails
- combined_text = "\n\n".join([r['text'] for r in region_results if r['text'].strip()])
- logger.warning("Using simple text joining (utils.text_utils not available)")
-
- # Store combined results for later use
- preprocessing_options['segmentation_data'] = {
- 'text_regions_coordinates': segmentation_results.get('text_regions_coordinates', []),
- 'regions_count': regions_count,
- 'segmentation_applied': True,
- 'combined_text': combined_text,
- 'region_results': region_results
- }
-
- logger.info(f"Successfully processed {len(region_results)} text regions")
-
- # Set up the temp path to use the segmented image
- temp_path = segmented_temp_path
-
- # IMPORTANT: We've already extracted text from individual regions,
- # emphasize their importance in our prompt
- if custom_prompt:
- # Add strong emphasis on using the already extracted text
- custom_prompt += f" IMPORTANT: The document has been segmented into {regions_count} text regions that have been processed individually. The text from these regions should be given HIGHEST PRIORITY and used as the primary source of text for the document. The combined image is provided only as supplementary context."
- else:
- # Create explicit prompt prioritizing region text
- custom_prompt = f"CRITICAL: This document has been preprocessed to highlight {regions_count} text regions that have been individually processed. The text from these regions is the PRIMARY source of content and should be prioritized over any text extracted from the combined image. Use the combined image only for context and layout understanding."
- else:
- # No individual regions found, use combined result
- temp_path = segmented_temp_path
-
- # Enhanced prompt based on segmentation results
- regions_count = len(segmentation_results.get('text_regions_coordinates', []))
- if custom_prompt:
- # Add segmentation info to existing prompt
- custom_prompt += f" The document has been segmented and contains approximately {regions_count} text regions that should be carefully extracted. Please focus on extracting all text from these regions."
- else:
- # Create new prompt focused on text extraction from segmented regions
- custom_prompt = f"This document has been preprocessed to highlight {regions_count} text regions. Please carefully extract all text from these highlighted regions, preserving the reading order and structure."
-
- # Store segmentation data in preprocessing options for later use
- preprocessing_options['segmentation_data'] = {
- 'text_regions_coordinates': segmentation_results.get('text_regions_coordinates', []),
- 'regions_count': regions_count,
- 'segmentation_applied': True
- }
-
- logger.info(f"Image segmentation applied. Found {len(segmentation_results.get('text_regions_coordinates', []))} text regions.")
- progress_reporter.update(40, f"Identified {len(segmentation_results.get('text_regions_coordinates', []))} text regions for extraction...")
- else:
- logger.warning("Image segmentation produced no result, using original image.")
- except Exception as seg_error:
- logger.warning(f"Image segmentation failed: {str(seg_error)}. Continuing with standard processing.")
-
- # Generate cache key
- cache_key = generate_cache_key(
- open(temp_path, 'rb').read(),
- file_type,
- use_vision,
- preprocessing_options,
- 0, # No rotation for images (handled in preprocessing)
- custom_prompt
- )
-
- # Process the file using cached function if possible
- progress_reporter.update(50, "Processing document with OCR...")
- try:
- # Use the document type from preprocessing options
- doc_type = preprocessing_options.get("document_type", "standard")
- modified_custom_prompt = custom_prompt
-
- # Check for letterhead/marginalia document types with specialized handling
- try:
- from utils.helpers.letterhead_handler import get_letterhead_prompt, is_likely_letterhead
- # Extract text density features if available
- features = None
- if 'text_density' in preprocessing_options:
- features = preprocessing_options['text_density']
-
- # Check if this looks like a letterhead document
- if is_likely_letterhead(temp_path, features):
- # Get specialized letterhead prompt
- letterhead_prompt = get_letterhead_prompt(temp_path, features)
- if letterhead_prompt:
- logger.info(f"Using specialized letterhead prompt for document")
- modified_custom_prompt = letterhead_prompt
- # Set document type for tracking
- preprocessing_options["document_type"] = "letterhead"
- doc_type = "letterhead"
- except ImportError:
- logger.debug("Letterhead handler not available")
-
- # Add document-type specific instructions based on preprocessing options
- if doc_type == "handwritten" and not modified_custom_prompt:
- modified_custom_prompt = "This is a handwritten document. Please carefully transcribe all handwritten text, preserving line breaks and original formatting."
- elif doc_type == "handwritten" and "handwritten" not in modified_custom_prompt.lower():
- modified_custom_prompt += " This is a handwritten document. Please carefully transcribe all handwritten text, preserving line breaks and original formatting."
- elif doc_type == "newspaper" and not modified_custom_prompt:
- modified_custom_prompt = "This is a newspaper or document with columns. Please extract all text content from each column, maintaining proper reading order."
- elif doc_type == "newspaper" and "column" not in modified_custom_prompt.lower() and "newspaper" not in modified_custom_prompt.lower():
- modified_custom_prompt += " This appears to be a newspaper or document with columns. Please extract all text content from each column."
- elif doc_type == "book" and not modified_custom_prompt:
- modified_custom_prompt = "This is a book page. Extract titles, headers, footnotes, and body text, preserving paragraph structure and formatting."
-
- # Update the cache key with the modified prompt
- if modified_custom_prompt != custom_prompt:
- cache_key = generate_cache_key(
- open(temp_path, 'rb').read(),
- file_type,
- use_vision,
- preprocessing_options,
- 0,
- modified_custom_prompt
- )
-
- result = process_file_cached(temp_path, file_type, use_vision, file_size_mb, cache_key, str(preprocessing_options), modified_custom_prompt)
- progress_reporter.update(80, "Analyzing document structure...")
- progress_reporter.update(90, "Finalizing results...")
- except Exception as e:
- logger.warning(f"Cached processing failed: {str(e)}. Retrying with direct processing.")
- progress_reporter.update(60, f"Processing error: {str(e)}. Retrying...")
-
- # If caching fails, process directly
- processor = StructuredOCR()
-
- # Apply performance mode settings
- if perf_mode == "Speed":
- # Use simpler processing for speed
- pass # Any speed optimizations would be handled by the StructuredOCR class
-
- # Use the document type from preprocessing options
- doc_type = preprocessing_options.get("document_type", "standard")
- modified_custom_prompt = custom_prompt
-
- # Check for letterhead/marginalia document types with specialized handling
- try:
- from utils.helpers.letterhead_handler import get_letterhead_prompt, is_likely_letterhead
- # Extract text density features if available
- features = None
- if 'text_density' in preprocessing_options:
- features = preprocessing_options['text_density']
-
- # Check if this looks like a letterhead document
- if is_likely_letterhead(temp_path, features):
- # Get specialized letterhead prompt
- letterhead_prompt = get_letterhead_prompt(temp_path, features)
- if letterhead_prompt:
- logger.info(f"Using specialized letterhead prompt for document")
- modified_custom_prompt = letterhead_prompt
- # Set document type for tracking
- preprocessing_options["document_type"] = "letterhead"
- doc_type = "letterhead"
- except ImportError:
- logger.debug("Letterhead handler not available")
-
- # Add document-type specific instructions based on preprocessing options
- if doc_type == "handwritten" and not modified_custom_prompt:
- modified_custom_prompt = "This is a handwritten document. Please carefully transcribe all handwritten text, preserving line breaks and original formatting."
- elif doc_type == "handwritten" and "handwritten" not in modified_custom_prompt.lower():
- modified_custom_prompt += " This is a handwritten document. Please carefully transcribe all handwritten text, preserving line breaks and original formatting."
- elif doc_type == "newspaper" and not modified_custom_prompt:
- modified_custom_prompt = "This is a newspaper or document with columns. Please extract all text content from each column, maintaining proper reading order."
- elif doc_type == "newspaper" and "column" not in modified_custom_prompt.lower() and "newspaper" not in modified_custom_prompt.lower():
- modified_custom_prompt += " This appears to be a newspaper or document with columns. Please extract all text content from each column."
- elif doc_type == "book" and not modified_custom_prompt:
- modified_custom_prompt = "This is a book page. Extract titles, headers, footnotes, and body text, preserving paragraph structure and formatting."
-
- result = processor.process_file(
- file_path=temp_path,
- file_type=file_type,
- use_vision=use_vision,
- custom_prompt=modified_custom_prompt,
- file_size_mb=file_size_mb
- )
-
- progress_reporter.update(90, "Finalizing results...")
-
- # Add additional metadata to result
- result = process_result(result, uploaded_file, preprocessing_options)
-
- # Make sure file_type is explicitly set for PDFs
- if file_type == "pdf":
- result['file_type'] = "pdf"
-
- # Check for duplicated text patterns that indicate handwritten text issues
- try:
- from utils.helpers.ocr_text_repair import detect_duplicate_text_issues, get_enhanced_preprocessing_options, get_handwritten_specific_prompt, clean_duplicated_text
-
- # Check OCR output for duplication issues
- if result and 'ocr_contents' in result and 'raw_text' in result['ocr_contents']:
- ocr_text = result['ocr_contents']['raw_text']
- has_duplication, duplication_details = detect_duplicate_text_issues(ocr_text)
-
- # If we detect significant duplication in the output
- if has_duplication and duplication_details.get('duplication_rate', 0) > 0.1:
- logger.info(f"Detected text duplication issues. Reprocessing as handwritten document with enhanced settings...")
- progress_reporter.update(75, "Detected duplication issues. Reprocessing with enhanced settings...")
-
- # Save original result before reprocessing
- original_result = result
-
- # Get enhanced preprocessing options for handwritten text
- enhanced_options = get_enhanced_preprocessing_options(preprocessing_options)
-
- # Reprocess with enhanced settings and specialized prompt
- handwritten_prompt = get_handwritten_specific_prompt(custom_prompt)
-
- # Process the image with the enhanced settings
- try:
- # Apply enhanced preprocessing to the original image
- enhanced_temp_path, _ = preprocessing.apply_preprocessing_to_file(
- open(temp_path, 'rb').read(),
- Path(temp_path).suffix.lower(),
- enhanced_options,
- temp_file_paths
- )
-
- # Process with enhanced settings
- processor = StructuredOCR()
- enhanced_result = processor.process_file(
- file_path=enhanced_temp_path,
- file_type="image",
- use_vision=use_vision,
- custom_prompt=handwritten_prompt,
- file_size_mb=file_size_mb
- )
-
- # Check if the enhanced result is better (less duplication)
- if 'ocr_contents' in enhanced_result and 'raw_text' in enhanced_result['ocr_contents']:
- enhanced_text = enhanced_result['ocr_contents']['raw_text']
- _, enhanced_issues = detect_duplicate_text_issues(enhanced_text)
-
- # Use the enhanced result if it's better
- if enhanced_issues.get('duplication_rate', 1.0) < duplication_details.get('duplication_rate', 1.0):
- logger.info("Enhanced processing improved OCR quality. Using enhanced result.")
- result = enhanced_result
- # Preserve document type and preprocessing info
- result['document_type'] = 'handwritten'
- result['preprocessing'] = enhanced_options
- else:
- # If enhancement didn't help, clean up the original result
- logger.info("Enhanced processing did not improve OCR quality. Cleaning original result.")
- result = original_result
- # Clean up duplication in the text
- if 'ocr_contents' in result and 'raw_text' in result['ocr_contents']:
- result['ocr_contents']['raw_text'] = clean_duplicated_text(result['ocr_contents']['raw_text'])
- else:
- # Fallback to original with cleaning
- logger.info("Enhanced processing failed. Cleaning original result.")
- result = original_result
- # Clean up duplication in the text
- if 'ocr_contents' in result and 'raw_text' in result['ocr_contents']:
- result['ocr_contents']['raw_text'] = clean_duplicated_text(result['ocr_contents']['raw_text'])
- except Exception as enh_error:
- logger.warning(f"Enhanced processing failed: {str(enh_error)}. Using cleaned original.")
- # Fallback to original with cleaning
- result = original_result
- # Clean up duplication in the text
- if 'ocr_contents' in result and 'raw_text' in result['ocr_contents']:
- result['ocr_contents']['raw_text'] = clean_duplicated_text(result['ocr_contents']['raw_text'])
- except ImportError:
- logger.debug("OCR text repair module not available")
-
- # 🔧 ALWAYS normalize result before returning
- result = clean_ocr_result(
- result,
- use_segmentation=use_segmentation,
- vision_enabled=use_vision,
- preprocessing_options=preprocessing_options
- )
-
- # Complete progress
- progress_reporter.complete()
-
- return result
- except Exception as e:
- # Handle errors
- error_message = handle_ocr_error(e, progress_reporter)
-
- # Return error result
- return {
- "file_name": uploaded_file.name,
- "topics": ["Document"],
- "languages": ["English"],
- "error": error_message,
- "ocr_contents": {
- "error": f"Failed to process file: {error_message}",
- "partial_text": "Document could not be processed due to an error."
- }
- }
- finally:
- # Clean up temporary files
- for temp_path in temp_file_paths:
- try:
- if os.path.exists(temp_path):
- os.unlink(temp_path)
- logger.info(f"Removed temporary file: {temp_path}")
- except Exception as e:
- logger.warning(f"Failed to remove temporary file {temp_path}: {str(e)}")
-
-def process_result(result, uploaded_file, preprocessing_options=None):
- """
- Process OCR result to add metadata, tags, etc.
-
- Args:
- result: OCR result dictionary
- uploaded_file: The uploaded file
- preprocessing_options: Dictionary of preprocessing options
-
- Returns:
- dict: Processed OCR result
- """
- # Add timestamp
- result['timestamp'] = format_timestamp()
-
- # Add processing time if not already present
- if 'processing_time' not in result:
- result['processing_time'] = 0.0
-
- # Generate descriptive filename
- file_ext = Path(uploaded_file.name).suffix.lower()
- result['descriptive_file_name'] = create_descriptive_filename(
- uploaded_file.name,
- result,
- file_ext,
- preprocessing_options
- )
-
- # Extract raw text from OCR contents for tag extraction without duplicating content
- raw_text = ""
- if 'ocr_contents' in result:
- # Try fields in order of preference
- for field in ["raw_text", "content", "text", "transcript", "main_text"]:
- if field in result['ocr_contents'] and result['ocr_contents'][field]:
- raw_text = result['ocr_contents'][field]
- break
-
- # Extract subject tags if not already present or enhance existing ones
- if 'topics' not in result or not result['topics']:
- result['topics'] = extract_subject_tags(result, raw_text, preprocessing_options)
-
- return result
diff --git a/ocr_utils.py b/ocr_utils.py
index 7306cbd8228ddfb5c1ab6dbc8d84df80d187f2ea..ec27512dc1bf4b9576315571bc2d21893eb3f84a 100644
--- a/ocr_utils.py
+++ b/ocr_utils.py
@@ -1,38 +1,74 @@
"""
-OCR utility functions for image processing and OCR operations.
-This module provides helper functions used across the Historical OCR application.
+Utility functions for OCR processing with Mistral AI.
+Contains helper functions for working with OCR responses and image handling.
"""
-import os
+import json
import base64
-import logging
+import io
from pathlib import Path
-from typing import Union, Optional
-
-# Configure logging
-logging.basicConfig(level=logging.INFO,
- format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
-logger = logging.getLogger(__name__)
-
-# Try to import optional dependencies
-try:
- import pytesseract
- TESSERACT_AVAILABLE = True
-except ImportError:
- logger.warning("pytesseract not available - local OCR fallback will not work")
- TESSERACT_AVAILABLE = False
+from typing import Dict, List, Optional, Union, Any
try:
from PIL import Image
PILLOW_AVAILABLE = True
except ImportError:
- logger.warning("PIL not available - image preprocessing will be limited")
PILLOW_AVAILABLE = False
+from mistralai import DocumentURLChunk, ImageURLChunk, TextChunk
+
+def replace_images_in_markdown(markdown_str: str, images_dict: dict) -> str:
+ """
+ Replace image placeholders in markdown with base64-encoded images.
+
+ Args:
+ markdown_str: Markdown text containing image placeholders
+ images_dict: Dictionary mapping image IDs to base64 strings
+
+ Returns:
+ Markdown text with images replaced by base64 data
+ """
+ for img_name, base64_str in images_dict.items():
+ markdown_str = markdown_str.replace(
+ f"![{img_name}]({img_name})", f"![{img_name}]({base64_str})"
+ )
+ return markdown_str
+
+def get_combined_markdown(ocr_response) -> str:
+ """
+ Combine OCR text and images into a single markdown document.
+ Ensures proper spacing between text and images.
+
+ Args:
+ ocr_response: Response from OCR processing containing text and images
+ See https://docs.mistral.ai/capabilities/document/ for API reference
+
+ Returns:
+ Combined markdown string with embedded images
+ """
+ markdowns: list[str] = []
+ # Extract images from page
+ for page in ocr_response.pages:
+ image_data = {}
+ for img in page.images:
+ image_data[img.id] = img.image_base64
+
+ # Replace image placeholders with actual images
+ page_markdown = replace_images_in_markdown(page.markdown, image_data)
+
+ # Ensure proper spacing between paragraphs and images
+ # Add extra newlines between paragraphs to improve rendering
+ page_markdown = page_markdown.replace("\n", "\n\n")
+
+ # Collect this page; separators are inserted when the pages are joined below
+ markdowns.append(page_markdown)
+
+ # Join pages with clear separators for multi-page documents
+ return "\n\n---\n\n".join(markdowns)
def encode_image_for_api(image_path: Union[str, Path]) -> str:
"""
- Encode an image as base64 data URL for API submission with proper MIME type.
+ Encode an image as base64 for API use.
Args:
image_path: Path to the image file
@@ -47,53 +83,130 @@ def encode_image_for_api(image_path: Union[str, Path]) -> str:
if not image_file.is_file():
raise FileNotFoundError(f"Image file not found: {image_file}")
- # Determine mime type based on file extension
- mime_type = 'image/jpeg' # Default mime type
- suffix = image_file.suffix.lower()
- if suffix == '.png':
- mime_type = 'image/png'
- elif suffix == '.gif':
- mime_type = 'image/gif'
- elif suffix in ['.jpg', '.jpeg']:
- mime_type = 'image/jpeg'
- elif suffix == '.pdf':
- mime_type = 'application/pdf'
-
# Encode image as base64
encoded = base64.b64encode(image_file.read_bytes()).decode()
- return f"data:{mime_type};base64,{encoded}"
+ return f"data:image/jpeg;base64,{encoded}"
+def process_image_with_ocr(client, image_path: Union[str, Path], model: str = "mistral-ocr-latest"):
+ """
+ Process an image with OCR and return the response.
+
+ Args:
+ client: Mistral AI client
+ image_path: Path to the image file
+ model: OCR model to use
+
+ Returns:
+ OCR response object
+ """
+ # Encode image as base64
+ base64_data_url = encode_image_for_api(image_path)
+
+ # Process image with OCR
+ image_response = client.ocr.process(
+ document=ImageURLChunk(image_url=base64_data_url),
+ model=model
+ )
+
+ return image_response
-def try_local_ocr_fallback(file_path: Union[str, Path], base64_data_url: Optional[str] = None) -> Optional[str]:
+def ocr_response_to_json(ocr_response, indent: int = 4) -> str:
"""
- Try to perform OCR using local Tesseract as a fallback when the API is unavailable.
+ Convert OCR response to a formatted JSON string.
Args:
- file_path: Path to the image file
- base64_data_url: Optional base64 data URL if already available
+ ocr_response: OCR response object
+ indent: Indentation level for JSON formatting
Returns:
- Extracted text or None if extraction failed
+ Formatted JSON string
"""
- if not TESSERACT_AVAILABLE or not PILLOW_AVAILABLE:
- logger.warning("Local OCR fallback is not available (missing dependencies)")
- return None
+ # Convert response to JSON
+ response_dict = json.loads(ocr_response.model_dump_json())
+ return json.dumps(response_dict, indent=indent)
+
+def get_combined_markdown_compressed(ocr_response, max_width: int = 800, quality: int = 85) -> str:
+ """
+ Combine OCR text and images into a single markdown document with compressed images.
+ Reduces image sizes to improve performance.
- try:
- logger.info("Using local Tesseract OCR as fallback")
+ Args:
+ ocr_response: Response from OCR processing containing text and images
+ max_width: Maximum width to resize images to (preserves aspect ratio)
+ quality: JPEG quality (0-100) for compression
+
+ Returns:
+ Combined markdown string with embedded compressed images
+ """
+ if not PILLOW_AVAILABLE:
+ # Fall back to regular method if PIL is not available
+ return get_combined_markdown(ocr_response)
+
+ markdowns: list[str] = []
+
+ # Process each page
+ for page in ocr_response.pages:
+ image_data = {}
+
+ # Process and compress each image
+ for img in page.images:
+ try:
+ # Decode base64 image
+ img_bytes = base64.b64decode(img.image_base64.split(',')[1] if ',' in img.image_base64 else img.image_base64)
+
+ # Open with PIL
+ pil_img = Image.open(io.BytesIO(img_bytes))
+
+ # Resize if needed (maintain aspect ratio)
+ original_width, original_height = pil_img.size
+ if original_width > max_width:
+ ratio = max_width / original_width
+ new_height = int(original_height * ratio)
+ pil_img = pil_img.resize((max_width, new_height), Image.LANCZOS)
+
+ # Convert to bytes with compression
+ buffer = io.BytesIO()
+ format = pil_img.format if pil_img.format else 'JPEG'
+ if format.upper() == 'JPEG' or format.upper() == 'JPG':
+ pil_img.save(buffer, format=format, quality=quality, optimize=True)
+ else:
+ # For non-JPEG formats (PNG, etc.)
+ pil_img.save(buffer, format=format, optimize=True)
+
+ # Convert back to base64
+ compressed_base64 = base64.b64encode(buffer.getvalue()).decode('utf-8')
+ mime_type = f"image/{format.lower()}" if format else "image/jpeg"
+ image_data[img.id] = f"data:{mime_type};base64,{compressed_base64}"
+
+ except Exception as e:
+ # If compression fails, use original image
+ image_data[img.id] = img.image_base64
+
+ # Replace image placeholders with compressed images
+ page_markdown = replace_images_in_markdown(page.markdown, image_data)
- # Use PIL to open the image
- img = Image.open(file_path)
+ # Ensure proper spacing between paragraphs and images
+ page_markdown = page_markdown.replace("\n", "\n\n")
- # Use Tesseract to extract text
- text = pytesseract.image_to_string(img)
+ # Add page to list
+ markdowns.append(page_markdown)
- if text:
- logger.info("Successfully extracted text using local Tesseract OCR")
- return text
- else:
- logger.warning("Tesseract extracted no text")
- return None
- except Exception as e:
- logger.error(f"Error using local OCR fallback: {str(e)}")
- return None
+ # Join pages with clear separators
+ return "\n\n---\n\n".join(markdowns)
+
+# For display in notebooks
+try:
+ from IPython.display import Markdown, display
+
+ def display_ocr_with_images(ocr_response):
+ """
+ Display OCR response with embedded images in IPython environments.
+
+ Args:
+ ocr_response: OCR response object
+ """
+ combined_markdown = get_combined_markdown(ocr_response)
+ display(Markdown(combined_markdown))
+except ImportError:
+ # IPython not available
+ pass
\ No newline at end of file
diff --git a/output/.gitkeep b/output/.gitkeep
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/output/example-1.html b/output/example-1.html
new file mode 100644
index 0000000000000000000000000000000000000000..8d05265eff794c3a1bebe6505cf942ae249cae0b
--- /dev/null
+++ b/output/example-1.html
@@ -0,0 +1,448 @@
+Source file changed.
+
+Rerun
+Always rerun
+Deploy
+
+
+ Options
+
+
+ Model Settings
+
+Use Vision Model
+
+
+ Image Preprocessing
+
+Preprocessing Options
+
+Convert to Grayscale
+
+Apply Thresholding
+
+Denoise Image
+
+Adjust Contrast
+
+0
+-5
+5
+
+
+ PDF Options
+
+PDF Settings
+
+PDF Resolution (DPI)
+
+150
+72
+300
+
+Maximum Pages to Process
+
+
+ Historical Document OCR
+
+ <#historical-document-ocr>
+
+
+ Powered by Mistral AI
+
+ <#powered-by-mistral-ai>
+
+Document Processing
+
+About
+
+Upload an image or PDF file to get started.
+
+Using the latest |mistral-ocr-latest| model for advanced document understanding.
+
+Choose a file
+
+Drag and drop file here
+Limit 500MB per file • PDF, PNG, JPG, JPEG
+
+Browse files
+
+ *
+
+ ymca-letter.png
+ 4.4MB
+
+
+ Document Preview
+
+ <#document-preview>
+
+0
+
+Image Preprocessing Preview
+
+*Original Image*
+
+0
+
+*Preprocessed Image*
+
+0
+
+Process Document
+
+Document Analysis
+
+Technical Details
+
+
+ Document Metadata
+
+ <#document-metadata>
+
+*Document processed successfully*
+
+*File Name:* img-0.jpeg
+
+*Languages:* Chinese
+
+*OCR Confidence:* 85.0%
+
+*Topics:* Young Men's Christian Association, Board of Managers,
+Philadelphia, Central Branch
+
+
+ Document Contents
+
+ <#document-contents>
+
+Structured View
+
+Raw JSON
+
+With Images
+
+
+ Header
+
+ <#header>
+
+THE YOUNG MEN'S CHRISTIAN ASSOCIATION
+
+
+ Image
+
+ <#image>
+
+img-0.jpeg
+
+
+ Central Branch
+
+ <#central-branch>
+
+*address:* CENTRAL BRANCH
+
+*location:* 1421 ARCH STREET PHILADELPHIA 2, PENNA. LOCUST 3-8100
+
+*title:* of Philadelphia and Vicinity
+
+
+ Board Chairman
+
+ <#board-chairman>
+
+THOMAS E. WALTON, Jr.
+
+
+ Executive Secretary
+
+ <#executive-secretary>
+
+NORMAN I PULLER
+
+
+ Board Of Managers
+
+ <#board-of-managers>
+
+ * JOSEPH DELACY
+
+ * WALTER R. DUNCAN
+
+ * RALPH B. EATON
+
+ * ALBERT H. FENSTERMACHER
+
+ * AUSTIN J. GAUGEL
+
+ * JOSEPH H. GEIS, JR.
+
+ * ARTHUR R. GREMEL
+
+ * ELIOT B. HANSCOM
+
+ * CHARLES E. HANTHORN
+
+ * GEORGE IRELAND
+
+ * GEORGE L. KLEWER
+
+ * ALEXANDER H. RALSTON
+
+ * ARTHUR SAXON, JR.
+
+ * HARRY W. SCHOB, SR.
+
+ * WALTER J. SCHOB
+
+ * HARVEY T. STEPHENS
+
+ * CHARLES E. SWANSON
+
+ * LAWRENCE W. THOMASON
+
+ * THOMAS E. WALTON, JR.
+
+ * D. ALEXANDER WIELAND
+
+ * THOMAS A. WOOD, JR.
+
+
+ Footer
+
+ <#footer>
+
+*president:* HARRY G. KUCI, President
+
+*general_secretary:* A PARTICIPANT IN THE UNITED FUND CHARLES U.
+SHELLENBERGES, General Secretary
+
+*image:* img-1.jpeg
+
+Export Content
+
+Download as HTML
+
+{
+"file_name":"img-0.jpeg"
+"topics":[
+0:
+"Young Men's Christian Association"
+1:
+"Board of Managers"
+2:
+"Philadelphia"
+3:
+"Central Branch"
+]
+"languages":[
+0:
+"Chinese"
+]
+"ocr_contents":{
+"header":"THE YOUNG MEN'S CHRISTIAN ASSOCIATION"
+"image":"img-0.jpeg"
+"central_branch":{
+"address":"CENTRAL BRANCH"
+"location":"1421 ARCH STREET PHILADELPHIA 2, PENNA. LOCUST 3-8100"
+"title":"of Philadelphia and Vicinity"
+}
+"board_chairman":"THOMAS E. WALTON, Jr."
+"executive_secretary":"NORMAN I PULLER"
+"board_of_managers":[
+0:
+"JOSEPH DELACY"
+1:
+"WALTER R. DUNCAN"
+2:
+"RALPH B. EATON"
+3:
+"ALBERT H. FENSTERMACHER"
+4:
+"AUSTIN J. GAUGEL"
+5:
+"JOSEPH H. GEIS, JR."
+6:
+"ARTHUR R. GREMEL"
+7:
+"ELIOT B. HANSCOM"
+8:
+"CHARLES E. HANTHORN"
+9:
+"GEORGE IRELAND"
+10:
+"GEORGE L. KLEWER"
+11:
+"ALEXANDER H. RALSTON"
+12:
+"ARTHUR SAXON, JR."
+13:
+"HARRY W. SCHOB, SR."
+14:
+"WALTER J. SCHOB"
+15:
+"HARVEY T. STEPHENS"
+16:
+"CHARLES E. SWANSON"
+17:
+"LAWRENCE W. THOMASON"
+18:
+"THOMAS E. WALTON, JR."
+19:
+"D. ALEXANDER WIELAND"
+20:
+"THOMAS A. WOOD, JR."
+]
+"footer":{
+"president":"HARRY G. KUCI, President"
+"general_secretary":"A PARTICIPANT IN THE UNITED FUND CHARLES U. SHELLENBERGES, General Secretary"
+"image":"img-1.jpeg"
+}
+}
+"confidence_score":0.85
+"raw_response":"OCRResponse(pages=[OCRPageObject(index=0, markdown="THE YOUNG MEN'S CHRISTIAN ASSOCIATION\n\n\nCENTRAL BRANCH\n1421 ARCH STREET PHILADELPHIA 2, PENNA. LOCUST 3-8100\nof Philadelphia and Vicinity\n\nTHOMAS E. WALTON, Jr. Board Chairman\n\nNORMAN I PULLER Executive Secretary\n\nCENTRAL BRANCH BOARD OF MANAGESS JOSEPH DELACY WALTER R. DUNCAN RALPH B. EATON ALBERT H. FENSTERMACHER AUSTIN J. GAUGEL JOSEPH H. GEIS, JR. ARTHUR R. GREMEL ELIOT B. HANSCOM CHARLES E. HANTHORN GEORGE IRELAND GEORGE L. KLEWER ALEXANDER H. RALSTON ARTHUR SAXON, JR. HARRY W. SCHOB, SR. WALTER J. SCHOB HARVEY T. STEPHENS CHARLES E. SWANSON LAWRENCE W. THOMASON THOMAS E. WALTON, JR. D. ALEXANDER WIELAND THOMAS A. WOOD, JR.\n", images=[OCRImageObject(id='img-0.jpeg', top_left_x=643, top_left_y=30, bottom_right_x=864, bottom_right_y=260, image_base64=''), OCRImageObject(id='img-1.jpeg', top_left_x=315, top_left_y=352, bottom_right_x=1298, bottom_right_y=1594, image_base64='')], dimensions=OCRPageDimensions(dpi=200, height=1678, width=1300))], model='mistral-ocr-2503-completion', usage_info=OCRUsageInfo(pages_processed=1, doc_size_bytes=94073))"
+"processing_time":8.566082954406738
+}
+THE YOUNG MEN'S CHRISTIAN ASSOCIATION
+
+img-0.jpeg
+
+CENTRAL BRANCH
+
+1421 ARCH STREET PHILADELPHIA 2, PENNA. LOCUST 3-8100
+
+of Philadelphia and Vicinity
+
+THOMAS E. WALTON, Jr. Board Chairman
+
+NORMAN I PULLER Executive Secretary
+
+CENTRAL BRANCH BOARD OF MANAGESS JOSEPH DELACY WALTER R. DUNCAN RALPH B.
+EATON ALBERT H. FENSTERMACHER AUSTIN J. GAUGEL JOSEPH H. GEIS, JR.
+ARTHUR R. GREMEL ELIOT B. HANSCOM CHARLES E. HANTHORN GEORGE IRELAND
+GEORGE L. KLEWER ALEXANDER H. RALSTON ARTHUR SAXON, JR. HARRY W. SCHOB,
+SR. WALTER J. SCHOB HARVEY T. STEPHENS CHARLES E. SWANSON LAWRENCE W.
+THOMASON THOMAS E. WALTON, JR. D. ALEXANDER WIELAND THOMAS A. WOOD, JR.
+
+img-1.jpeg
+
+
+ Raw Processing Results
+
+ <#raw-processing-results>
+
+{
+"file_name":"img-0.jpeg"
+"topics":[
+0:
+"Young Men's Christian Association"
+1:
+"Board of Managers"
+2:
+"Philadelphia"
+3:
+"Central Branch"
+]
+"languages":[
+0:
+"Chinese"
+]
+"ocr_contents":{
+"header":"THE YOUNG MEN'S CHRISTIAN ASSOCIATION"
+"image":"img-0.jpeg"
+"central_branch":{
+"address":"CENTRAL BRANCH"
+"location":"1421 ARCH STREET PHILADELPHIA 2, PENNA. LOCUST 3-8100"
+"title":"of Philadelphia and Vicinity"
+}
+"board_chairman":"THOMAS E. WALTON, Jr."
+"executive_secretary":"NORMAN I PULLER"
+"board_of_managers":[
+0:
+"JOSEPH DELACY"
+1:
+"WALTER R. DUNCAN"
+2:
+"RALPH B. EATON"
+3:
+"ALBERT H. FENSTERMACHER"
+4:
+"AUSTIN J. GAUGEL"
+5:
+"JOSEPH H. GEIS, JR."
+6:
+"ARTHUR R. GREMEL"
+7:
+"ELIOT B. HANSCOM"
+8:
+"CHARLES E. HANTHORN"
+9:
+"GEORGE IRELAND"
+10:
+"GEORGE L. KLEWER"
+11:
+"ALEXANDER H. RALSTON"
+12:
+"ARTHUR SAXON, JR."
+13:
+"HARRY W. SCHOB, SR."
+14:
+"WALTER J. SCHOB"
+15:
+"HARVEY T. STEPHENS"
+16:
+"CHARLES E. SWANSON"
+17:
+"LAWRENCE W. THOMASON"
+18:
+"THOMAS E. WALTON, JR."
+19:
+"D. ALEXANDER WIELAND"
+20:
+"THOMAS A. WOOD, JR."
+]
+"footer":{
+"president":"HARRY G. KUCI, President"
+"general_secretary":"A PARTICIPANT IN THE UNITED FUND CHARLES U. SHELLENBERGES, General Secretary"
+"image":"img-1.jpeg"
+}
+}
+"confidence_score":0.85
+"raw_response":"OCRResponse(pages=[OCRPageObject(index=0, markdown="THE YOUNG MEN'S CHRISTIAN ASSOCIATION\n\n\nCENTRAL BRANCH\n1421 ARCH STREET PHILADELPHIA 2, PENNA. LOCUST 3-8100\nof Philadelphia and Vicinity\n\nTHOMAS E. WALTON, Jr. Board Chairman\n\nNORMAN I PULLER Executive Secretary\n\nCENTRAL BRANCH BOARD OF MANAGESS JOSEPH DELACY WALTER R. DUNCAN RALPH B. EATON ALBERT H. FENSTERMACHER AUSTIN J. GAUGEL JOSEPH H. GEIS, JR. ARTHUR R. GREMEL ELIOT B. HANSCOM CHARLES E. HANTHORN GEORGE IRELAND GEORGE L. KLEWER ALEXANDER H. RALSTON ARTHUR SAXON, JR. HARRY W. SCHOB, SR. WALTER J. SCHOB HARVEY T. STEPHENS CHARLES E. SWANSON LAWRENCE W. THOMASON THOMAS E. WALTON, JR. D. ALEXANDER WIELAND THOMAS A. WOOD, JR.\n", images=[OCRImageObject(id='img-0.jpeg', top_left_x=643, top_left_y=30, bottom_right_x=864, bottom_right_y=260, image_base64=''), OCRImageObject(id='img-1.jpeg', top_left_x=315, top_left_y=352, bottom_right_x=1298, bottom_right_y=1594, image_base64='')], dimensions=OCRPageDimensions(dpi=200, height=1678, width=1300))], model='mistral-ocr-2503-completion', usage_info=OCRUsageInfo(pages_processed=1, doc_size_bytes=94073))"
+"processing_time":8.566082954406738
+}
+
+
+ About This Application
+
+ <#about-this-application>
+
+This app uses Mistral AI's Document OCR to extract text and images from historical
+documents.
+
+It can process:
+
+ * Image files (jpg, png, etc.)
+ * PDF documents (multi-page support)
+
+The extracted content is processed into structured data based on the
+document type, combining:
+
+ * Text extraction with |mistral-ocr-latest|
+ * Analysis with language models
+ * Layout preservation with images
+
+View results in three formats:
+
+ * Structured HTML view
+ * Raw JSON (for developers)
+ * Markdown with images (preserves document layout)
+
+*New Features:*
+
+ * Image preprocessing for better OCR quality
+ * PDF resolution and page controls
+ * Progress tracking during processing
+
diff --git a/output/recipe_test.json b/output/recipe_test.json
new file mode 100644
index 0000000000000000000000000000000000000000..e6de184505b2b91c41cfaa25d95b1091e0a991e4
--- /dev/null
+++ b/output/recipe_test.json
@@ -0,0 +1,16 @@
+{
+ "file_name": "img-0.jpeg",
+ "topics": [
+ "Cooking",
+ "Recipes",
+ "Baking"
+ ],
+ "languages": [
+ "English"
+ ],
+ "ocr_contents": {
+ "title": "Pecan Butterballs Cookies",
+ "recipe": "1 cup butter, creamy if possible\n1/4 inch honey\n2 \" ounces flour\n1/2 teaspoon salt\n2 \" ounces pecans\n2 cups finely chopped pecans\nForm into small balls, bake at 300 40-45 min roll in uncoated sugar"
+ },
+ "confidence_score": 0.85,
+ "raw_response":
\ No newline at end of file
diff --git a/output/ymca-letter.jpg b/output/ymca-letter.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..9f657aa990425c29e571e523938ba1d82b79df13
--- /dev/null
+++ b/output/ymca-letter.jpg
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:22e0102df7d37ad482169f796435fa228a0b42b2a1661380044f781589ccbac8
+size 210565
diff --git a/packages.txt b/packages.txt
index f6afc4d1a00929ae04e5658d8d584556f0b71b50..c1298f5aebdba9b2e9b819d036559cac9e687863 100644
--- a/packages.txt
+++ b/packages.txt
@@ -1,2 +1,2 @@
-poppler-utils
tesseract-ocr
+poppler-utils
\ No newline at end of file
diff --git a/prepare_for_hf.py b/prepare_for_hf.py
new file mode 100644
index 0000000000000000000000000000000000000000..2c1ce53e16789faf3ff5634e55ac67b3a3eac54a
--- /dev/null
+++ b/prepare_for_hf.py
@@ -0,0 +1,84 @@
+#!/usr/bin/env python3
+"""
+Prepare the repository for Hugging Face Spaces deployment.
+This script:
+1. Creates a requirements.txt file with only the necessary dependencies
+2. Ensures app.py is ready for HF deployment
+3. Makes sure all configuration files are properly set up
+4. Configures Hugging Face module integration if needed
+"""
+
+import os
+import shutil
+import sys
+from pathlib import Path
+
+# Configuration for HF module
+HF_MODULE_ENABLED = True # Set to False to disable the educational module
+
+def setup_hf_module():
+ """Setup the Hugging Face educational module if enabled"""
+ if not HF_MODULE_ENABLED:
+ print("Hugging Face educational module is disabled.")
+ return
+
+ print("Setting up Hugging Face educational module...")
+
+ # Ensure directories exist
+ for directory in ["modules", "ui"]:
+ if not os.path.exists(directory):
+ os.makedirs(directory)
+ print(f"Created {directory} directory")
+
+ # Check if module files exist
+ required_files = ["streamlit_app.py", "modules/modular_app.py", "ui/layout.py"]
+ missing_files = [f for f in required_files if not os.path.exists(f)]
+
+ if missing_files:
+ print("Warning: Some module files are missing:")
+ for file in missing_files:
+ print(f" - {file}")
+ print("The educational version may not work correctly.")
+ else:
+ print("All required module files are present.")
+
+def main():
+ print("Preparing repository for Hugging Face Spaces deployment...")
+
+ # Make sure output directory exists
+ if not os.path.exists("output"):
+ os.makedirs("output")
+ print("Created output directory")
+
+ # Clean up unnecessary files
+ files_to_remove = [".env", ".env.example", ".git"]
+ for file in files_to_remove:
+ if os.path.exists(file):
+ if os.path.isdir(file):
+ shutil.rmtree(file)
+ else:
+ os.remove(file)
+ print(f"Removed {file}")
+
+ # Check requirements.txt exists
+ if not os.path.exists("requirements.txt"):
+ print("ERROR: requirements.txt not found. Please create it before deploying.")
+ sys.exit(1)
+
+ # Make sure run_local.sh is executable
+ if os.path.exists("run_local.sh"):
+ os.chmod("run_local.sh", 0o755)
+ print("Made run_local.sh executable")
+
+ # Configure HF module if enabled
+ setup_hf_module()
+
+ # Remove any large unnecessary files from input directory
+ # Keep only sample files that are needed for demos
+ print("NOTE: Large files in the input directory will be uploaded to Hugging Face.")
+ print("You may want to remove unnecessary files before deployment.")
+
+ print("Repository preparation complete!")
+
+if __name__ == "__main__":
+ main()
\ No newline at end of file
diff --git a/preprocessing.py b/preprocessing.py
deleted file mode 100644
index 9ad43d18919d0ab9e3fd2e1d36cdf6c63ae55af8..0000000000000000000000000000000000000000
--- a/preprocessing.py
+++ /dev/null
@@ -1,637 +0,0 @@
-import os
-import io
-import cv2
-import numpy as np
-import tempfile
-import time
-import math
-import json
-from PIL import Image, ImageEnhance, ImageFilter
-from pdf2image import convert_from_bytes
-import streamlit as st
-import logging
-import concurrent.futures
-from pathlib import Path
-
-# Configure logging
-logger = logging.getLogger("preprocessing")
-logger.setLevel(logging.INFO)
-
-# Ensure logs directory exists
-def ensure_log_directory(config):
- """Create logs directory if it doesn't exist"""
- if config.get("logging", {}).get("enabled", False):
- log_path = config.get("logging", {}).get("output_path", "logs/preprocessing_metrics.json")
- log_dir = os.path.dirname(log_path)
- if log_dir:
- Path(log_dir).mkdir(parents=True, exist_ok=True)
-
-def log_preprocessing_metrics(metrics, config):
- """Log preprocessing metrics to JSON file"""
- if not config.get("enabled", False):
- return
-
- log_path = config.get("output_path", "logs/preprocessing_metrics.json")
- ensure_log_directory({"logging": {"enabled": True, "output_path": log_path}})
-
- # Add timestamp
- metrics["timestamp"] = time.strftime("%Y-%m-%d %H:%M:%S")
-
- # Append to log file
- try:
- existing_data = []
- if os.path.exists(log_path):
- with open(log_path, 'r') as f:
- existing_data = json.load(f)
- if not isinstance(existing_data, list):
- existing_data = [existing_data]
-
- existing_data.append(metrics)
-
- with open(log_path, 'w') as f:
- json.dump(existing_data, f, indent=2)
-
- logger.info(f"Logged preprocessing metrics to {log_path}")
- except Exception as e:
- logger.error(f"Error logging preprocessing metrics: {str(e)}")
-
-def get_document_config(document_type, global_config):
- """
- Get document-specific preprocessing configuration by merging with global settings.
-
- Args:
- document_type: The type of document (e.g., 'standard', 'newspaper', 'handwritten')
- global_config: The global preprocessing configuration
-
- Returns:
- A merged configuration dictionary with document-specific overrides
- """
- # Start with a copy of the global config
- config = {
- "deskew": global_config.get("deskew", {}),
- "thresholding": global_config.get("thresholding", {}),
- "morphology": global_config.get("morphology", {}),
- "performance": global_config.get("performance", {}),
- "logging": global_config.get("logging", {})
- }
-
- # Apply document-specific overrides if they exist
- doc_types = global_config.get("document_types", {})
- if document_type in doc_types:
- doc_config = doc_types[document_type]
-
- # Merge document-specific settings into the config
- for section in doc_config:
- if section in config:
- config[section].update(doc_config[section])
-
- return config
-
-def deskew_image(img_array, config):
- """
- Detect and correct skew in document images.
-
- Uses a combination of methods (minAreaRect and/or Hough transform)
- to estimate the skew angle more robustly.
-
- Args:
- img_array: Input image as numpy array
- config: Deskew configuration dict
-
- Returns:
- Deskewed image as numpy array, estimated angle, success flag
- """
- if not config.get("enabled", False):
- return img_array, 0.0, True
-
- # Convert to grayscale if needed
- gray = img_array if len(img_array.shape) == 2 else cv2.cvtColor(img_array, cv2.COLOR_RGB2GRAY)
-
- # Start with a threshold to get binary image for angle detection
- _, binary = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)
-
- angles = []
- angle_threshold = config.get("angle_threshold", 0.1)
- max_angle = config.get("max_angle", 45.0)
-
- # Method 1: minAreaRect approach
- try:
- # Find all contours
- contours, _ = cv2.findContours(binary, cv2.RETR_LIST, cv2.CHAIN_APPROX_SIMPLE)
-
- # Filter contours by area to avoid noise
- min_area = binary.shape[0] * binary.shape[1] * 0.0001 # 0.01% of image area
- filtered_contours = [cnt for cnt in contours if cv2.contourArea(cnt) > min_area]
-
- # Get angles from rotated rectangles around contours
- for contour in filtered_contours:
- rect = cv2.minAreaRect(contour)
- width, height = rect[1]
-
- # Calculate the angle based on the longer side
- # (This is important for getting the orientation right)
- angle = rect[2]
- if width < height:
- angle += 90
-
- # Normalize angle to -45 to 45 range
- if angle > 45:
- angle -= 90
- if angle < -45:
- angle += 90
-
- # Clamp angle to max limit
- angle = max(min(angle, max_angle), -max_angle)
- angles.append(angle)
- except Exception as e:
- logger.error(f"Error in minAreaRect skew detection: {str(e)}")
-
- # Method 2: Hough Transform approach (if enabled)
- if config.get("use_hough", True):
- try:
- # Apply Canny edge detection
- edges = cv2.Canny(gray, 50, 150, apertureSize=3)
-
- # Apply Hough lines
- lines = cv2.HoughLinesP(edges, 1, np.pi/180,
- threshold=100, minLineLength=100, maxLineGap=10)
-
- if lines is not None:
- for line in lines:
- x1, y1, x2, y2 = line[0]
- if x2 - x1 != 0: # Avoid division by zero
- # Calculate line angle in degrees
- angle = math.atan2(y2 - y1, x2 - x1) * 180.0 / np.pi
-
- # Normalize angle to -45 to 45 range
- if angle > 45:
- angle -= 90
- if angle < -45:
- angle += 90
-
- # Clamp angle to max limit
- angle = max(min(angle, max_angle), -max_angle)
- angles.append(angle)
- except Exception as e:
- logger.error(f"Error in Hough transform skew detection: {str(e)}")
-
- # If no angles were detected, return original image
- if not angles:
- logger.warning("No skew angles detected, using original image")
- return img_array, 0.0, False
-
- # Combine angles using the specified consensus method
- consensus_method = config.get("consensus_method", "average")
- if consensus_method == "average":
- final_angle = sum(angles) / len(angles)
- elif consensus_method == "median":
- final_angle = sorted(angles)[len(angles) // 2]
- elif consensus_method == "min":
- final_angle = min(angles, key=abs)
- elif consensus_method == "max":
- final_angle = max(angles, key=abs)
- else:
- final_angle = sum(angles) / len(angles) # Default to average
-
- # If angle is below threshold, don't rotate
- if abs(final_angle) < angle_threshold:
- logger.info(f"Detected angle ({final_angle:.2f}°) is below threshold, skipping deskew")
- return img_array, final_angle, True
-
- # Log the detected angle
- logger.info(f"Deskewing image with angle: {final_angle:.2f}°")
-
- # Get image dimensions
- h, w = img_array.shape[:2]
- center = (w // 2, h // 2)
-
- # Get rotation matrix
- rotation_matrix = cv2.getRotationMatrix2D(center, final_angle, 1.0)
-
- # Calculate new image dimensions
- abs_cos = abs(rotation_matrix[0, 0])
- abs_sin = abs(rotation_matrix[0, 1])
- new_w = int(h * abs_sin + w * abs_cos)
- new_h = int(h * abs_cos + w * abs_sin)
-
- # Adjust the rotation matrix to account for new dimensions
- rotation_matrix[0, 2] += (new_w / 2) - center[0]
- rotation_matrix[1, 2] += (new_h / 2) - center[1]
-
- # Perform the rotation
- try:
- # Determine the number of channels to create the correct output array
- if len(img_array.shape) == 3:
- rotated = cv2.warpAffine(img_array, rotation_matrix, (new_w, new_h),
- flags=cv2.INTER_LINEAR, borderMode=cv2.BORDER_CONSTANT,
- borderValue=(255, 255, 255))
- else:
- rotated = cv2.warpAffine(img_array, rotation_matrix, (new_w, new_h),
- flags=cv2.INTER_LINEAR, borderMode=cv2.BORDER_CONSTANT,
- borderValue=255)
- return rotated, final_angle, True
- except Exception as e:
- logger.error(f"Error rotating image: {str(e)}")
- if config.get("fallback", {}).get("enabled", True):
- logger.info("Using original image as fallback after rotation failure")
- return img_array, final_angle, False
- return img_array, final_angle, False
-
-def preblur(img_array, config):
- """
- Apply pre-filtering blur to stabilize thresholding results.
-
- Args:
- img_array: Input image as numpy array
- config: Pre-blur configuration dict
-
- Returns:
- Blurred image as numpy array
- """
- if not config.get("enabled", False):
- return img_array
-
- method = config.get("method", "gaussian")
- kernel_size = config.get("kernel_size", 3)
-
- # Ensure kernel size is odd
- if kernel_size % 2 == 0:
- kernel_size += 1
-
- try:
- if method == "gaussian":
- return cv2.GaussianBlur(img_array, (kernel_size, kernel_size), 0)
- elif method == "median":
- return cv2.medianBlur(img_array, kernel_size)
- else:
- logger.warning(f"Unknown blur method: {method}, using gaussian")
- return cv2.GaussianBlur(img_array, (kernel_size, kernel_size), 0)
- except Exception as e:
- logger.error(f"Error applying {method} blur: {str(e)}")
- return img_array
-
-def apply_threshold(img_array, config):
- """
- Apply thresholding to create binary image.
-
- Supports Otsu's method and adaptive thresholding.
- Includes pre-filtering and fallback mechanisms.
-
- Args:
- img_array: Input image as numpy array
- config: Thresholding configuration dict
-
- Returns:
- Binary image as numpy array, success flag
- """
- method = config.get("method", "adaptive")
- if method == "none":
- return img_array, True
-
- # Convert to grayscale if needed
- gray = img_array if len(img_array.shape) == 2 else cv2.cvtColor(img_array, cv2.COLOR_RGB2GRAY)
-
- # Apply pre-blur if configured
- preblur_config = config.get("preblur", {})
- if preblur_config.get("enabled", False):
- gray = preblur(gray, preblur_config)
-
- binary = None
- try:
- if method == "otsu":
- # Apply Otsu's thresholding
- _, binary = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
- elif method == "adaptive":
- # Apply adaptive thresholding
- block_size = config.get("adaptive_block_size", 11)
- constant = config.get("adaptive_constant", 2)
-
- # Ensure block size is odd
- if block_size % 2 == 0:
- block_size += 1
-
- binary = cv2.adaptiveThreshold(gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
- cv2.THRESH_BINARY, block_size, constant)
- else:
- logger.warning(f"Unknown thresholding method: {method}, using adaptive")
- block_size = config.get("adaptive_block_size", 11)
- constant = config.get("adaptive_constant", 2)
-
- # Ensure block size is odd
- if block_size % 2 == 0:
- block_size += 1
-
- binary = cv2.adaptiveThreshold(gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
- cv2.THRESH_BINARY, block_size, constant)
- except Exception as e:
- logger.error(f"Error applying {method} thresholding: {str(e)}")
- if config.get("fallback", {}).get("enabled", True):
- logger.info("Using original grayscale image as fallback after thresholding failure")
- return gray, False
- return gray, False
-
- # Calculate percentage of non-zero pixels for logging
- nonzero_pct = np.count_nonzero(binary) / binary.size * 100
- logger.info(f"Binary image has {nonzero_pct:.2f}% non-zero pixels")
-
- # Check if thresholding was successful (crude check)
- if nonzero_pct < 1 or nonzero_pct > 99:
- logger.warning(f"Thresholding produced extreme result ({nonzero_pct:.2f}% non-zero)")
- if config.get("fallback", {}).get("enabled", True):
- logger.info("Using original grayscale image as fallback after poor thresholding")
- return gray, False
-
- return binary, True
-
-def apply_morphology(binary_img, config):
- """
- Apply morphological operations to clean up binary image.
-
- Supports opening, closing, or both operations.
-
- Args:
- binary_img: Binary image as numpy array
- config: Morphology configuration dict
-
- Returns:
- Processed binary image as numpy array
- """
- if not config.get("enabled", False):
- return binary_img
-
- operation = config.get("operation", "close")
- kernel_size = config.get("kernel_size", 1)
- kernel_shape = config.get("kernel_shape", "rect")
-
- # Create appropriate kernel
- if kernel_shape == "rect":
- kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (kernel_size*2+1, kernel_size*2+1))
- elif kernel_shape == "ellipse":
- kernel = cv2.getStructuringElement(cv2.MORPH_ELLIPSE, (kernel_size*2+1, kernel_size*2+1))
- elif kernel_shape == "cross":
- kernel = cv2.getStructuringElement(cv2.MORPH_CROSS, (kernel_size*2+1, kernel_size*2+1))
- else:
- logger.warning(f"Unknown kernel shape: {kernel_shape}, using rect")
- kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (kernel_size*2+1, kernel_size*2+1))
-
- result = binary_img
- try:
- if operation == "open":
- # Opening: Erosion followed by dilation - removes small noise
- result = cv2.morphologyEx(binary_img, cv2.MORPH_OPEN, kernel)
- elif operation == "close":
- # Closing: Dilation followed by erosion - fills small holes
- result = cv2.morphologyEx(binary_img, cv2.MORPH_CLOSE, kernel)
- elif operation == "both":
- # Both operations in sequence
- result = cv2.morphologyEx(binary_img, cv2.MORPH_OPEN, kernel)
- result = cv2.morphologyEx(result, cv2.MORPH_CLOSE, kernel)
- else:
- logger.warning(f"Unknown morphological operation: {operation}, using close")
- result = cv2.morphologyEx(binary_img, cv2.MORPH_CLOSE, kernel)
- except Exception as e:
- logger.error(f"Error applying morphological operation: {str(e)}")
- return binary_img
-
- return result
-
-@st.cache_data(ttl=24*3600, show_spinner=False) # Cache for 24 hours
-def convert_pdf_to_images(pdf_bytes, dpi=150, rotation=0):
- """Convert PDF bytes to a list of images with caching"""
- try:
- images = convert_from_bytes(pdf_bytes, dpi=dpi)
-
- # Apply rotation if specified
- if rotation != 0 and images:
- rotated_images = []
- for img in images:
- rotated_img = img.rotate(rotation, expand=True, resample=Image.BICUBIC)
- rotated_images.append(rotated_img)
- return rotated_images
-
- return images
- except Exception as e:
- st.error(f"Error converting PDF: {str(e)}")
- logger.error(f"PDF conversion error: {str(e)}")
- return []
-
-@st.cache_data(ttl=24*3600, show_spinner=False, hash_funcs={dict: lambda x: str(sorted(x.items()))})
-def preprocess_image(image_bytes, preprocessing_options):
- """
- Conservative preprocessing function for handwritten documents with early exit for clean scans.
- Implements light processing: grayscale → denoise (gently) → contrast (conservative)
-
- Args:
- image_bytes: Image content as bytes
- preprocessing_options: Dictionary with document_type, grayscale, denoise, contrast options
-
- Returns:
- Processed image bytes or original image bytes if no processing needed
- """
- # Setup basic console logging
- logger = logging.getLogger("image_preprocessor")
- logger.setLevel(logging.INFO)
-
- # Log which preprocessing options are being applied
- logger.info(f"Document type: {preprocessing_options.get('document_type', 'standard')}")
-
- # Check if any preprocessing is actually requested
- has_preprocessing = (
- preprocessing_options.get("grayscale", False) or
- preprocessing_options.get("denoise", False) or
- preprocessing_options.get("contrast", 0) != 0
- )
-
- # Convert bytes to PIL Image
- image = Image.open(io.BytesIO(image_bytes))
-
- # Check for minimal skew and exit early if document is already straight
- # This avoids unnecessary processing for clean scans
- try:
- from utils.image_utils import detect_skew
- skew_angle = detect_skew(image)
- if abs(skew_angle) < 0.5:
- logger.info(f"Document has minimal skew ({skew_angle:.2f}°), skipping preprocessing")
- # Return original image bytes as is for perfectly straight documents
- if not has_preprocessing:
- return image_bytes
- except Exception as e:
- logger.warning(f"Error in skew detection: {str(e)}, continuing with preprocessing")
-
- # If no preprocessing options are selected, return the original image
- if not has_preprocessing:
- logger.info("No preprocessing options selected, skipping preprocessing")
- return image_bytes
-
- # Initialize metrics for logging
- metrics = {
- "file": preprocessing_options.get("filename", "unknown"),
- "document_type": preprocessing_options.get("document_type", "standard"),
- "preprocessing_applied": []
- }
- start_time = time.time()
-
- # Handle RGBA images (transparency) by converting to RGB
- if image.mode == 'RGBA':
- # Convert RGBA to RGB by compositing onto white background
- logger.info("Converting RGBA image to RGB")
- background = Image.new('RGB', image.size, (255, 255, 255))
- background.paste(image, mask=image.split()[3]) # 3 is the alpha channel
- image = background
- metrics["preprocessing_applied"].append("alpha_conversion")
- elif image.mode not in ('RGB', 'L'):
- # Convert other modes to RGB
- logger.info(f"Converting {image.mode} image to RGB")
- image = image.convert('RGB')
- metrics["preprocessing_applied"].append("format_conversion")
-
- # Convert to NumPy array for OpenCV processing
- img_array = np.array(image)
-
- # Apply grayscale if requested (useful for handwritten text)
- if preprocessing_options.get("grayscale", False):
- if len(img_array.shape) == 3: # Only convert if it's not already grayscale
- # For handwritten documents, apply gentle CLAHE to enhance contrast locally
- if preprocessing_options.get("document_type") == "handwritten":
- img_array = cv2.cvtColor(img_array, cv2.COLOR_RGB2GRAY)
- clahe = cv2.createCLAHE(clipLimit=1.5, tileGridSize=(8,8)) # Conservative clip limit
- img_array = clahe.apply(img_array)
- else:
- # Standard grayscale for printed documents
- img_array = cv2.cvtColor(img_array, cv2.COLOR_RGB2GRAY)
-
- metrics["preprocessing_applied"].append("grayscale")
-
- # Apply light denoising if requested
- if preprocessing_options.get("denoise", False):
- try:
- # Apply very gentle denoising
- is_color = len(img_array.shape) == 3 and img_array.shape[2] == 3
- if is_color:
- # Very light color denoising with conservative parameters
- img_array = cv2.fastNlMeansDenoisingColored(img_array, None, 2, 2, 3, 7)
- else:
- # Very light grayscale denoising
- img_array = cv2.fastNlMeansDenoising(img_array, None, 2, 3, 7)
-
- metrics["preprocessing_applied"].append("light_denoise")
- except Exception as e:
- logger.error(f"Denoising error: {str(e)}")
-
- # Apply contrast adjustment if requested (conservative range)
- contrast_value = preprocessing_options.get("contrast", 0)
- if contrast_value != 0:
- # Use a gentler contrast adjustment factor
- contrast_factor = 1 + (contrast_value / 200) # Conservative scaling factor
-
- # Convert NumPy array back to PIL Image for contrast adjustment
- if len(img_array.shape) == 2: # If grayscale, convert to RGB for PIL
- image = Image.fromarray(cv2.cvtColor(img_array, cv2.COLOR_GRAY2RGB))
- else:
- image = Image.fromarray(img_array)
-
- enhancer = ImageEnhance.Contrast(image)
- image = enhancer.enhance(contrast_factor)
-
- # Convert back to NumPy array
- img_array = np.array(image)
- metrics["preprocessing_applied"].append(f"contrast_{contrast_value}")
-
- # Convert back to PIL Image
- if len(img_array.shape) == 2: # If grayscale, convert to RGB for saving
- processed_image = Image.fromarray(cv2.cvtColor(img_array, cv2.COLOR_GRAY2RGB))
- else:
- processed_image = Image.fromarray(img_array)
-
- # Record total processing time
- metrics["processing_time"] = (time.time() - start_time) * 1000 # ms
-
- # Higher quality for OCR processing
- byte_io = io.BytesIO()
- try:
- # Make sure the image is in RGB mode before saving as JPEG
- if processed_image.mode not in ('RGB', 'L'):
- processed_image = processed_image.convert('RGB')
-
- processed_image.save(byte_io, format='JPEG', quality=92, optimize=True)
- byte_io.seek(0)
-
- logger.info(f"Preprocessing complete. Original image mode: {image.mode}, processed mode: {processed_image.mode}")
- logger.info(f"Original size: {len(image_bytes)/1024:.1f}KB, processed size: {len(byte_io.getvalue())/1024:.1f}KB")
- logger.info(f"Applied preprocessing steps: {', '.join(metrics['preprocessing_applied'])}")
-
- return byte_io.getvalue()
- except Exception as e:
- logger.error(f"Error saving processed image: {str(e)}")
- # Fallback to original image
- logger.info("Using original image as fallback")
- return image_bytes
-
-def create_temp_file(content, suffix, temp_file_paths):
- """Create a temporary file and track it for cleanup"""
- with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp:
- tmp.write(content)
- temp_path = tmp.name
- # Track temporary file for cleanup
- temp_file_paths.append(temp_path)
- logger.info(f"Created temporary file: {temp_path}")
- return temp_path
-
-def apply_preprocessing_to_file(file_bytes, file_ext, preprocessing_options, temp_file_paths):
- """
- Apply conservative preprocessing to file and return path to the temporary file.
- Handles format conversion and user-selected preprocessing options.
-
- Args:
- file_bytes: File content as bytes
- file_ext: File extension (e.g., '.jpg', '.pdf')
- preprocessing_options: Dictionary with document_type and preprocessing options
- temp_file_paths: List to track temporary files for cleanup
-
- Returns:
- Tuple of (temp_file_path, was_processed_flag)
- """
- document_type = preprocessing_options.get("document_type", "standard")
-
- # Check for user-selected preprocessing
- has_preprocessing = (
- preprocessing_options.get("grayscale", False) or
- preprocessing_options.get("denoise", False) or
- preprocessing_options.get("contrast", 0) != 0
- )
-
- # Check for RGBA/transparency that needs conversion
- format_needs_conversion = False
-
- # Only check formats that might have transparency
- if file_ext.lower() in ['.png', '.tif', '.tiff']:
- try:
- # Check if image has transparency
- image = Image.open(io.BytesIO(file_bytes))
- if image.mode == 'RGBA' or image.mode not in ('RGB', 'L'):
- format_needs_conversion = True
- except Exception as e:
- logger.warning(f"Error checking image format: {str(e)}")
-
- # Process if user requested preprocessing OR format needs conversion
- needs_processing = has_preprocessing or format_needs_conversion
-
- if needs_processing:
- # Apply preprocessing
- logger.info(f"Applying preprocessing with options: {preprocessing_options}")
- logger.info(f"Using document type '{document_type}' with advanced preprocessing options")
-
- # Add filename to preprocessing options for logging if available
- if hasattr(file_bytes, 'name'):
- preprocessing_options["filename"] = file_bytes.name
-
- processed_bytes = preprocess_image(file_bytes, preprocessing_options)
-
- # Save processed image to temp file
- temp_path = create_temp_file(processed_bytes, file_ext, temp_file_paths)
- return temp_path, True # Return path and flag indicating preprocessing was applied
- else:
- # No preprocessing needed, just save the original file
- logger.info("No preprocessing applied - using original image")
- temp_path = create_temp_file(file_bytes, file_ext, temp_file_paths)
- return temp_path, False # Return path and flag indicating no preprocessing was applied
diff --git a/process_file.py b/process_file.py
index 6195b15ad669892adde85ce058feac0f192016b8..e7dfb5bde566f22aa86a9e8104e88bf5989365ee 100644
--- a/process_file.py
+++ b/process_file.py
@@ -53,7 +53,7 @@ def process_file(uploaded_file, use_vision=True, processor=None, custom_prompt=N
"file_size_mb": round(file_size_mb, 2),
"use_vision": use_vision
})
-
+
return result
except Exception as e:
return {
@@ -63,4 +63,4 @@ def process_file(uploaded_file, use_vision=True, processor=None, custom_prompt=N
finally:
# Clean up the temporary file
if os.path.exists(temp_path):
- os.unlink(temp_path)
+ os.unlink(temp_path)
\ No newline at end of file
diff --git a/requirements.txt b/requirements.txt
index 22ac57441a5d1d21ec79d17b116a6da46ab78ab2..cd8bfec2f8a8e2662446f6ae690ed53227bf7b47 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,24 +1,10 @@
-# Requirements for Historical OCR application
-
-# Core dependencies
-streamlit>=1.30.0
-mistralai>=0.1.0 # Updated to latest Mistral AI SDK
-pydantic>=2.5.0 # Updated for better BaseModel support
-
-# Image processing
-Pillow>=10.0.0
-opencv-python-headless>=4.8.0.74
-pdf2image>=1.16.0
-pytesseract>=0.3.10 # For local OCR fallback
-matplotlib>=3.7.0 # For visualization in preprocessing tests
-
-# Data handling and utilities
-numpy>=1.24.0
-pycountry>=22.1.10
-requests>=2.31.0
-python-dotenv>=1.0.0
+streamlit>=1.43.2
+mistralai>=0.0.7
+pydantic>=2.0.0
+pycountry>=23.12.11
+pillow>=10.0.0
python-multipart>=0.0.6
-
-# Type checking and linting
-mypy>=1.5.0
-ruff>=0.1.5
+pdf2image>=1.17.0
+pytesseract>=0.3.10
+opencv-python-headless>=4.6.0
+numpy>=1.23.5
\ No newline at end of file
diff --git a/run_large_files.sh b/run_large_files.sh
new file mode 100644
index 0000000000000000000000000000000000000000..743833fe12bf1c017e9457601a79447dc9cb51a8
--- /dev/null
+++ b/run_large_files.sh
@@ -0,0 +1,22 @@
+#!/bin/bash
+# Run the Streamlit app with increased buffer size for large files
+
+# Load environment variables from .env file if it exists
+if [ -f .env ]; then
+ echo "Loading environment variables from .env file"
+ set -o allexport
+ source .env
+ set +o allexport
+else
+ echo "No .env file found. Using API key from config.py"
+fi
+
+# Check if MISTRAL_API_KEY is set
+if [ -z "$MISTRAL_API_KEY" ]; then
+ echo "WARNING: MISTRAL_API_KEY is not set in environment. Using key from config.py if available."
+fi
+
+# Run the Streamlit app with increased buffer size
+streamlit run app.py \
+ --server.maxUploadSize=500 \
+ --server.maxMessageSize=500
\ No newline at end of file
diff --git a/run_local.sh b/run_local.sh
new file mode 100644
index 0000000000000000000000000000000000000000..5e66df0fc27162d901ac9e8a90516e231b19ec97
--- /dev/null
+++ b/run_local.sh
@@ -0,0 +1,25 @@
+#!/bin/bash
+
+# Determine which version of the app to run
+if [ "$1" == "educational" ]; then
+ APP_FILE="streamlit_app.py"
+ echo "Starting Educational Version..."
+else
+ APP_FILE="app.py"
+ echo "Starting Standard Version..."
+fi
+
+# Check if .env file exists and load it
+if [ -f .env ]; then
+ echo "Loading environment variables from .env file"
+ export $(grep -v '^#' .env | xargs)
+fi
+
+# Check if the Mistral API key is set
+if [ -z "${MISTRAL_API_KEY}" ]; then
+ echo "Warning: MISTRAL_API_KEY environment variable is not set."
+ echo "The application will run in demo mode with limited functionality."
+fi
+
+# Start the Streamlit app
+streamlit run $APP_FILE --server.maxUploadSize=50 --server.enableCORS=false --server.enableXsrfProtection=false
\ No newline at end of file
diff --git a/setup_git.sh b/setup_git.sh
new file mode 100644
index 0000000000000000000000000000000000000000..331be4e7e41b38ce4ccce2f3efaf6b0a64f4eead
--- /dev/null
+++ b/setup_git.sh
@@ -0,0 +1,35 @@
+#!/bin/bash
+# Setup git repository for Hugging Face Spaces
+
+# Check if HF_TOKEN environment variable is set
+if [ -z "$HF_TOKEN" ]; then
+ echo "Error: HF_TOKEN environment variable is not set."
+ echo "Please set it first with: export HF_TOKEN=your_hugging_face_token"
+ exit 1
+fi
+
+# Get your username
+echo "Enter your Hugging Face username:"
+read HF_USERNAME
+
+# Get the space name
+echo "Enter the name for your Hugging Face Space (e.g., historical-ocr):"
+read HF_SPACE
+
+# Prepare the files for deployment
+echo "Preparing files for deployment..."
+python3 prepare_for_hf.py
+
+# Initialize git
+git init
+git add .
+git commit -m "Initial commit"
+
+# Create the repository on Hugging Face
+echo "Creating and pushing to Hugging Face Space..."
+git remote add origin https://huggingface.co/spaces/$HF_USERNAME/$HF_SPACE
+huggingface-cli login --token $HF_TOKEN
+git push -u origin main
+
+echo "Deployment completed! Your app should be available at:"
+echo "https://huggingface.co/spaces/$HF_USERNAME/$HF_SPACE"
\ No newline at end of file
diff --git a/simple_test.py b/simple_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..2ee95cede8e3bfb76f7ac767e6133990d545dde5
--- /dev/null
+++ b/simple_test.py
@@ -0,0 +1,69 @@
+#!/usr/bin/env python3
+"""
+Simple test script for structured_ocr.py
+"""
+
+import os
+import sys
+import json
+from pathlib import Path
+
+def main():
+ print("Testing OCR with a sample image file")
+
+ # Path to the sample image file
+ image_path = os.path.join("input", "recipe.jpg")
+
+ # Check if the file exists
+ if not os.path.isfile(image_path):
+ print(f"Error: Image file not found at {image_path}")
+ return
+
+ print(f"File found: {image_path}")
+
+ # Create the output directory if it doesn't exist
+ output_dir = "output"
+ os.makedirs(output_dir, exist_ok=True)
+
+ output_path = os.path.join(output_dir, "recipe_test.json")
+
+ # Import the StructuredOCR class
+ from structured_ocr import StructuredOCR
+
+ # Initialize OCR processor
+ processor = StructuredOCR()
+
+ try:
+ # Process the image file
+ print(f"Processing image file: {image_path}")
+ result = processor.process_file(image_path, file_type="image")
+
+ # Save the result to the output file
+ with open(output_path, 'w') as f:
+ json.dump(result, f, indent=2)
+
+ print(f"Image processing completed successfully. Output saved to {output_path}")
+
+ # Check if the output file exists
+ if os.path.isfile(output_path):
+ print(f"Output file exists at {output_path}")
+ # Print the file size
+ file_size = os.path.getsize(output_path)
+ print(f"Output file size: {file_size} bytes")
+
+ # Print a preview of the output file
+ print("\nPreview of output file:")
+ with open(output_path, 'r') as f:
+ data = json.load(f)
+ print(f"File name: {data.get('file_name', '')}")
+ print(f"Topics: {', '.join(data.get('topics', []))}")
+ print(f"Languages: {', '.join(data.get('languages', []))}")
+ print("OCR contents keys:", list(data.get('ocr_contents', {}).keys()))
+ else:
+ print(f"Error: Output file not found at {output_path}")
+
+ except Exception as e:
+ print(f"Error processing image: {e}")
+
+if __name__ == "__main__":
+ main()
\ No newline at end of file
diff --git a/static/favicon.ico b/static/favicon.ico
deleted file mode 100644
index 7aec5b7630b636cd1cf2635bb21ffd73b465f019..0000000000000000000000000000000000000000
Binary files a/static/favicon.ico and /dev/null differ
diff --git a/static/favicon.png b/static/favicon.png
deleted file mode 100644
index 48371841ee942974030a8bf6469febf1a081695b..0000000000000000000000000000000000000000
--- a/static/favicon.png
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:579585886ddea743aa3e212e698632f315c6130d5d6dd3287a015011dbb8fc3a
-size 779
diff --git a/static/scroll.svg b/static/scroll.svg
deleted file mode 100644
index 9a430b354d0ee5bbe861fcd4dcc637ce48fb3905..0000000000000000000000000000000000000000
--- a/static/scroll.svg
+++ /dev/null
@@ -1,8 +0,0 @@
-
-
\ No newline at end of file
diff --git a/.DS_Store b/streamlit/.DS_Store
similarity index 67%
rename from .DS_Store
rename to streamlit/.DS_Store
index 8b1cf4613515d5bb678b199e9ed7d7bd425758c6..02ff0e872c9af3306af9533047a7c8e73b93eea6 100644
Binary files a/.DS_Store and b/streamlit/.DS_Store differ
diff --git a/streamlit/.gitattributes b/streamlit/.gitattributes
new file mode 100644
index 0000000000000000000000000000000000000000..9712efeb00e191c078f0a36993a34be7b7d5bada
--- /dev/null
+++ b/streamlit/.gitattributes
@@ -0,0 +1,39 @@
+*.7z filter=lfs diff=lfs merge=lfs -text
+*.arrow filter=lfs diff=lfs merge=lfs -text
+*.bin filter=lfs diff=lfs merge=lfs -text
+*.bz2 filter=lfs diff=lfs merge=lfs -text
+*.ckpt filter=lfs diff=lfs merge=lfs -text
+*.ftz filter=lfs diff=lfs merge=lfs -text
+*.gz filter=lfs diff=lfs merge=lfs -text
+*.h5 filter=lfs diff=lfs merge=lfs -text
+*.joblib filter=lfs diff=lfs merge=lfs -text
+*.lfs.* filter=lfs diff=lfs merge=lfs -text
+*.mlmodel filter=lfs diff=lfs merge=lfs -text
+*.model filter=lfs diff=lfs merge=lfs -text
+*.msgpack filter=lfs diff=lfs merge=lfs -text
+*.npy filter=lfs diff=lfs merge=lfs -text
+*.npz filter=lfs diff=lfs merge=lfs -text
+*.onnx filter=lfs diff=lfs merge=lfs -text
+*.ot filter=lfs diff=lfs merge=lfs -text
+*.parquet filter=lfs diff=lfs merge=lfs -text
+*.pb filter=lfs diff=lfs merge=lfs -text
+*.pickle filter=lfs diff=lfs merge=lfs -text
+*.pkl filter=lfs diff=lfs merge=lfs -text
+*.pt filter=lfs diff=lfs merge=lfs -text
+*.pth filter=lfs diff=lfs merge=lfs -text
+*.rar filter=lfs diff=lfs merge=lfs -text
+*.safetensors filter=lfs diff=lfs merge=lfs -text
+saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+*.tar.* filter=lfs diff=lfs merge=lfs -text
+*.tar filter=lfs diff=lfs merge=lfs -text
+*.tflite filter=lfs diff=lfs merge=lfs -text
+*.tgz filter=lfs diff=lfs merge=lfs -text
+*.wasm filter=lfs diff=lfs merge=lfs -text
+*.xz filter=lfs diff=lfs merge=lfs -text
+*.zip filter=lfs diff=lfs merge=lfs -text
+*.zst filter=lfs diff=lfs merge=lfs -text
+*tfevents* filter=lfs diff=lfs merge=lfs -text
+input/baldwin-letter-1.jpg filter=lfs diff=lfs merge=lfs -text
+input/baldwin-letter-2.jpg filter=lfs diff=lfs merge=lfs -text
+input/magellan-travels.jpg filter=lfs diff=lfs merge=lfs -text
+input/okeefe-menu.pdf filter=lfs diff=lfs merge=lfs -text
diff --git a/streamlit/config.py b/streamlit/config.py
new file mode 100644
index 0000000000000000000000000000000000000000..a089ed5e6c6d0c1f08b9b4a6fc3118f31e4b48e4
--- /dev/null
+++ b/streamlit/config.py
@@ -0,0 +1,28 @@
+# config.py
+"""
+Configuration file for OCR processing.
+Contains model settings.
+"""
+import os
+
+# API key is set by the backend - not user facing
+# Read from the MISTRAL_API_KEY environment variable; an empty string is used
+# when the variable is not set.
+MISTRAL_API_KEY = os.environ.get("MISTRAL_API_KEY", "")
+
+# Model settings
+# Model identifier strings, one per role (OCR, text, vision).
+# NOTE(review): where each identifier is consumed is not visible in this file - confirm against callers.
+OCR_MODEL = "mistral-ocr-latest"
+TEXT_MODEL = "mistral-small-latest"
+VISION_MODEL = "pixtral-12b-latest"
+
+# System message for historical document processing
+SYSTEM_MESSAGE = """You are an AI Assistant with document understanding for historical materials. You will be provided with documents, and you must answer any questions related to those materials.
+
+# HISTORICAL DOCUMENT UNDERSTANDING
+When analyzing historical documents, consider these important factors:
+1. Physical Condition: Documents may be degraded, damaged, or faded over time.
+2. Historical Context: Consider the time period, cultural context, and historical significance.
+3. Annotation: Documents may contain handwritten notes, stamps, or additions from various time periods.
+4. Layout Complexity: Historical documents often have complex layouts, multiple columns, and varied typography.
+5. Mixed Media: Documents may combine text, imagery, diagrams, and decorative elements.
+6. Language Evolution: Historical language, terminology, and abbreviations may differ from modern usage.
+
+Provide thoughtful analysis that acknowledges these complexities and maintains scholarly integrity when interpreting historical materials."""
\ No newline at end of file
diff --git a/streamlit/custom.css b/streamlit/custom.css
new file mode 100644
index 0000000000000000000000000000000000000000..e701ad662756ea4df4382bc4e124401e4581bae8
--- /dev/null
+++ b/streamlit/custom.css
@@ -0,0 +1,303 @@
+/* Base Tailwind-like styles */
+:root {
+ --color-gray-900: #111827;
+ --color-gray-800: #1f2937;
+ --color-gray-700: #374151;
+ --color-gray-600: #4B5563;
+ --color-gray-500: #6B7280;
+ --color-gray-400: #9CA3AF;
+ --color-gray-300: #D1D5DB;
+ --color-gray-200: #E5E7EB;
+ --color-gray-100: #F3F4F6;
+ --color-gray-50: #F9FAFB;
+
+ --color-blue-900: #1E3A8A;
+ --color-blue-800: #1E40AF;
+ --color-blue-700: #1D4ED8;
+ --color-blue-600: #2563EB;
+ --color-blue-500: #3B82F6;
+ --color-blue-400: #60A5FA;
+ --color-blue-300: #93C5FD;
+ --color-blue-200: #BFDBFE;
+ --color-blue-100: #DBEAFE;
+ --color-blue-50: #EFF6FF;
+
+ --color-yellow-50: #FFFBEB;
+ --color-yellow-100: #FEF3C7;
+}
+
+/* Global Styles */
+.stApp {
+ background-color: var(--color-gray-900);
+ color: white;
+}
+
+/* Main header */
+.main-header {
+ background-color: black;
+ padding: 1rem;
+ border-bottom: 1px solid var(--color-gray-700);
+}
+
+.title-text {
+ font-size: 1.5rem;
+ font-weight: bold;
+ color: white;
+}
+
+/* Content containers */
+.content-container {
+ background-color: var(--color-gray-800);
+ border-radius: 0.75rem;
+ padding: 1.5rem;
+ margin-bottom: 1.5rem;
+ box-shadow: 0 4px 6px rgba(0, 0, 0, 0.2);
+ color: white;
+}
+
+.blue-container {
+ background-color: var(--color-blue-100);
+ color: var(--color-gray-900);
+ border-radius: 0.75rem;
+ padding: 1.5rem;
+ margin-bottom: 1.5rem;
+}
+
+.yellow-container {
+ background-color: var(--color-yellow-50);
+ color: var(--color-gray-900);
+ border-radius: 0.75rem;
+ padding: 1.5rem;
+ margin-bottom: 1.5rem;
+}
+
+/* Card grid styles */
+.card-grid {
+ display: grid;
+ grid-template-columns: repeat(1, 1fr);
+ gap: 1.5rem;
+ margin-bottom: 1.5rem;
+}
+
+@media (min-width: 768px) {
+ .card-grid {
+ grid-template-columns: repeat(3, 1fr);
+ }
+}
+
+.card {
+ background-color: var(--color-gray-700);
+ border-radius: 0.5rem;
+ padding: 1rem;
+ color: white;
+}
+
+/* Special containers */
+.key-concept {
+ background-color: var(--color-gray-700);
+ border-radius: 0.5rem;
+ padding: 0.75rem;
+ margin: 1rem 0;
+ border-left: 3px solid var(--color-blue-500);
+ color: white;
+}
+
+.research-question {
+ background-color: var(--color-blue-900);
+ border-radius: 0.5rem;
+ padding: 0.75rem;
+ margin: 1rem 0;
+ border-left: 3px solid var(--color-blue-400);
+ color: white;
+}
+
+.quote-container {
+ font-style: italic;
+ color: var(--color-gray-300);
+ padding: 0.5rem 1rem;
+ border-left: 3px solid var(--color-gray-600);
+ margin: 1rem 0;
+}
+
+/* Navigation */
+.nav-container {
+ position: fixed;
+ bottom: 0;
+ left: 0;
+ width: 100%;
+ background-color: black;
+ border-top: 1px solid var(--color-gray-700);
+ padding: 0.75rem 1rem;
+ display: flex;
+ justify-content: space-between;
+ z-index: 1000;
+}
+
+.nav-buttons {
+ display: flex;
+ gap: 0.5rem;
+}
+
+.prev-button {
+ background-color: var(--color-gray-700);
+ color: white;
+ padding: 0.5rem 1rem;
+ border-radius: 0.25rem;
+ border: none;
+ cursor: pointer;
+}
+
+.prev-button:hover {
+ background-color: var(--color-gray-600);
+}
+
+.next-button {
+ background-color: var(--color-blue-600);
+ color: white;
+ padding: 0.5rem 1rem;
+ border-radius: 0.25rem;
+ border: none;
+ cursor: pointer;
+}
+
+.next-button:hover {
+ background-color: var(--color-blue-700);
+}
+
+.nav-dots {
+ display: none;
+}
+
+@media (min-width: 768px) {
+ .nav-dots {
+ display: flex;
+ gap: 0.25rem;
+ }
+}
+
+.nav-dot {
+ width: 2rem;
+ height: 2rem;
+ display: flex;
+ align-items: center;
+ justify-content: center;
+ color: var(--color-gray-300);
+ border-radius: 0.25rem;
+ text-decoration: none;
+ font-size: 0.875rem;
+}
+
+.nav-dot:hover {
+ background-color: var(--color-gray-800);
+}
+
+.nav-dot.active {
+ background-color: var(--color-blue-800);
+ color: white;
+ font-weight: 500;
+}
+
+/* Override Streamlit Styles */
+.stTextInput > div > div > input {
+ background-color: var(--color-gray-700);
+ color: white;
+}
+
+.stSelectbox > div > div > div {
+ background-color: var(--color-gray-700);
+ color: white;
+}
+
+.stCheckbox > div > label {
+ color: white;
+}
+
+/* Button styling */
+.stButton > button {
+ background-color: var(--color-blue-600);
+ color: white;
+}
+
+.stButton > button:hover {
+ background-color: var(--color-blue-700);
+}
+
+/* Sidebars */
+[data-testid="stSidebar"] {
+ background-color: var(--color-gray-900);
+}
+
+[data-testid="stSidebar"] .stMarkdown {
+ color: white;
+}
+
+/* Module card styles */
+.module-card {
+ background-color: var(--color-gray-800);
+ border-radius: 0.5rem;
+ padding: 1rem;
+ margin-bottom: 1rem;
+ border-top: 4px solid var(--color-blue-500);
+ transition: transform 0.2s;
+}
+
+.module-card:hover {
+ transform: translateY(-3px);
+ box-shadow: 0 4px 8px rgba(0, 0, 0, 0.2);
+}
+
+.module-number {
+ background-color: var(--color-blue-500);
+ color: white;
+ font-weight: bold;
+ padding: 0.25rem 0.5rem;
+ border-radius: 1rem;
+ font-size: 0.9rem;
+ display: inline-block;
+ margin-bottom: 0.5rem;
+}
+
+.module-title {
+ font-weight: 600;
+ margin-bottom: 0.5rem;
+ font-size: 1.1rem;
+}
+
+/* Add space at bottom for fixed nav */
+.main-content {
+ padding-bottom: 4rem;
+}
+
+/* Tool container styles */
+.tool-container {
+ background-color: var(--color-gray-800);
+ color: white;
+ padding: 1.5rem;
+ border-radius: 0.5rem;
+ border: 1px solid var(--color-gray-700);
+ margin-bottom: 1.5rem;
+}
+
+/* Upload container */
+.upload-container {
+ border: 2px dashed var(--color-gray-600);
+ padding: 1.5rem;
+ text-align: center;
+ border-radius: 0.5rem;
+ margin-bottom: 1rem;
+ background-color: var(--color-gray-700);
+}
+
+/* Footer spacing */
+.footer-spacer {
+ height: 4rem;
+}
+
+/* Tabs */
+.stTabs [data-baseweb="tab"] {
+ color: white;
+}
+
+.stTabs [data-baseweb="tab-highlight"] {
+ background-color: var(--color-blue-600);
+}
\ No newline at end of file
diff --git a/streamlit/layout.py b/streamlit/layout.py
new file mode 100644
index 0000000000000000000000000000000000000000..ebcdb8c19ffde05be13e8feec5b1184c13fc4f39
--- /dev/null
+++ b/streamlit/layout.py
@@ -0,0 +1,172 @@
+import streamlit as st
+from pathlib import Path
+import os
+
+# Load custom CSS
+def load_css():
+ css_file = Path(__file__).parent / "custom.css"
+ if css_file.exists():
+ with open(css_file) as f:
+ st.markdown(f"", unsafe_allow_html=True)
+ else:
+ st.warning("Custom CSS file not found. Some styles may be missing.")
+
+# Header component
+def header():
+ st.markdown("""
+
+
Historical OCR Workshop
+
+ """, unsafe_allow_html=True)
+
+# Create a page wrapper similar to the React component
+def page_wrapper(content_function, current_module=1):
+ """
+ Creates a consistent page layout with navigation
+ Args:
+ content_function: Function that renders the page content
+ current_module: Current module number (1-6)
+ """
+ # Load custom CSS
+ load_css()
+
+ # Display header
+ header()
+
+ # Ensure session state for navigation
+ if 'current_module' not in st.session_state:
+ st.session_state.current_module = current_module
+
+ # Main content area with bottom padding for the nav
+ st.markdown('
', unsafe_allow_html=True)
+
+ # Call the content function to render the module content
+ content_function()
+
+ # Add spacer for fixed nav
+ st.markdown('', unsafe_allow_html=True)
+
+ # Navigation
+ render_navigation(current_module)
+
+ st.markdown('
+ """, unsafe_allow_html=True)
+
+# Previous button HTML
+def prev_button_html(current_module, modules):
+ if current_module > 1:
+ prev_module = current_module - 1
+ return f"""
+
+ """
+ return ""
+
+# Next button HTML
+def next_button_html(current_module, modules):
+ if current_module < len(modules):
+ next_module = current_module + 1
+ return f"""
+
+ """
+ return ""
+
+# Navigation dots HTML
+def nav_dots_html(current_module, modules):
+ dots_html = ""
+ for i, name in enumerate(modules, 1):
+ active_class = "active" if i == current_module else ""
+ dots_html += f"""
+
+ {i}
+
+ """
+ return dots_html
+
+# Helper functions for container styles
+def gray_container(content, padding="1.5rem"):
+ """Renders content in a gray container with consistent styling"""
+ st.markdown(f'
{content}
', unsafe_allow_html=True)
+
+def blue_container(content, padding="1.5rem"):
+ """Renders content in a blue container with consistent styling"""
+ st.markdown(f'
{content}
', unsafe_allow_html=True)
+
+def yellow_container(content, padding="1.5rem"):
+ """Renders content in a yellow container with consistent styling"""
+ st.markdown(f'
{content}
', unsafe_allow_html=True)
+
+def card_grid(cards):
+ """
+ Renders a responsive grid of cards
+ Args:
+ cards: List of HTML strings for each card
+ """
+ grid_html = '
', unsafe_allow_html=True)
+
+def research_question(content):
+ """Renders a research question box"""
+ st.markdown(f'
{content}
', unsafe_allow_html=True)
+
+def quote(content, author=""):
+ """Renders a quote with optional author"""
+ quote_html = f'
{content}'
+ if author:
+ quote_html += f'
— {author}'
+ quote_html += '
'
+ st.markdown(quote_html, unsafe_allow_html=True)
+
+def tool_container(content):
+ """Renders content in a tool container"""
+ st.markdown(f'
{content}
', unsafe_allow_html=True)
+
+def upload_container(content):
+ """Renders content in an upload container"""
+ st.markdown(f'
{content}
', unsafe_allow_html=True)
\ No newline at end of file
diff --git a/ui/.DS_Store b/streamlit/modules/.DS_Store
similarity index 100%
rename from ui/.DS_Store
rename to streamlit/modules/.DS_Store
diff --git a/streamlit/modules/__init__.py b/streamlit/modules/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..9b5767eca0096d8d3be1514fcb79c97e0f4caf7b
--- /dev/null
+++ b/streamlit/modules/__init__.py
@@ -0,0 +1,36 @@
+"""
+Module initialization file for the workshop modules.
+"""
+from . import module1, module2, module3, module4, module5, module6
+
+# Module registry for easy access by module number
+modules = {
+ 1: module1,
+ 2: module2,
+ 3: module3,
+ 4: module4,
+ 5: module5,
+ 6: module6
+}
+
+# Module names for navigation and display
+module_names = [
+ "Introduction",
+ "Text-Image Relations",
+ "OCR Technology",
+ "Methodological Approaches",
+ "Interactive OCR",
+ "Conclusion"
+]
+
+def get_module(module_number):
+ """Get a module by its number (1-6)"""
+ if module_number in modules:
+ return modules[module_number]
+ raise ValueError(f"Unknown module number: {module_number}")
+
+def get_module_name(module_number):
+ """Get a module name by its number (1-6)"""
+ if 1 <= module_number <= len(module_names):
+ return module_names[module_number - 1]
+ return f"Module {module_number}"
\ No newline at end of file
diff --git a/streamlit/modules/module1.py b/streamlit/modules/module1.py
new file mode 100644
index 0000000000000000000000000000000000000000..822f36dd501269596ad5f37b510027a362eb375a
--- /dev/null
+++ b/streamlit/modules/module1.py
@@ -0,0 +1,85 @@
+import streamlit as st
+from layout import gray_container, blue_container, yellow_container, card_grid, key_concept
+
+def render():
+ """Module 1: Introduction and Problematization"""
+
+ st.title("Module 1: Introduction and Problematization")
+
+ # Workshop overview in gray container
+ overview_content = """
+
Workshop Overview
+
+ This interactive workshop explores the application of OCR technology to historical documents,
+ combining theoretical understanding with practical experiences. Designed for historians,
+ archivists, and digital humanities scholars, it offers both conceptual frameworks and hands-on skills.
+
+ """
+ gray_container(overview_content)
+
+ # For historians section with blue background
+ historians_content = """
+
For Historians:
+
+ How might OCR technology transform our access to and interpretation of historical
+ documents? What new research questions become possible when large archives
+ become machine-readable?
+
+ """
+ blue_container(historians_content)
+
+ # What is OCR section with yellow background
+ ocr_content = """
+
What is OCR?
+
+ Optical Character Recognition (OCR) technology enables computers to extract text from images and documents.
+ Modern OCR uses AI vision models to understand both the text and its visual context.
+
How might the capabilities of vision-language models change our approach to digitizing historical archives?
+ """
+ research_question(research_content)
+
+ # Display history if available
+ if 'processing_history' in st.session_state and st.session_state.processing_history:
+ with st.expander("Your OCR Processing History"):
+ st.markdown("You've already processed the following documents:")
+
+ for item in st.session_state.processing_history:
+ st.markdown(f"**{item['fileName']}**")
+ col1, col2 = st.columns(2)
+ with col1:
+ st.write(f"**Topics:** {', '.join(item['result'].get('topics', ['Unknown']))}")
+ with col2:
+ st.write(f"**Vision model used:** {'Yes' if item['useVision'] else 'No'}")
\ No newline at end of file
diff --git a/streamlit/modules/module4.py b/streamlit/modules/module4.py
new file mode 100644
index 0000000000000000000000000000000000000000..81dac95f5f7b62960b00dbe68f0ed1b7be357812
--- /dev/null
+++ b/streamlit/modules/module4.py
@@ -0,0 +1,124 @@
+import streamlit as st
+from pathlib import Path
+from layout import gray_container, tool_container, key_concept, quote
+
+def render():
+ """Module 4: Methodological Approaches"""
+
+ st.title("Module 4: Methodological Approaches")
+
+ col1, col2 = st.columns([1, 1])
+
+ with col1:
+ hybrid_content = """
+
Hybrid Methodologies
+
+
1. Computational + Human Reading
+
+
OCR for initial processing and discovery
+
Human review for context and interpretation
+
Iterative refinement of computational outputs
+
+
+
2. Close + Distant Reading
+
+
Distant reading through large-scale OCR processing
+
Close reading of selected passages
+
Zooming between scales of analysis
+
+ """
+ gray_container(hybrid_content)
+
+ # Check if the diagram image is available and display it
+ input_dir = Path(__file__).parent.parent / "input"
+ diagram_path = input_dir / "diagram.jpg"
+
+ if diagram_path.exists():
+ try:
+ from PIL import Image
+ with Image.open(diagram_path) as img:
+ st.image(img, caption="Historical VLM architecture", use_column_width=True)
+ except Exception:
+ # If there's an error, just show a placeholder
+ st.image("https://placekitten.com/800/400", caption="Historical VLM architecture placeholder")
+ else:
+ # If the file doesn't exist, show a placeholder
+ st.image("https://placekitten.com/800/400", caption="Historical VLM architecture placeholder")
+
+ with col2:
+ mistral_content = """
+
Mistral-OCR-Latest: State-of-the-Art
+
+
The Mistral-OCR model represents a significant advancement:
+
+
Multimodal Understanding: Processes both visual and textual information
Historical Font Adaptation: Trained on diverse historical typography
+
+ """
+ gray_container(mistral_content)
+
+ # Check if the workflow image is available and display it
+ workflow_path = input_dir / "workflow.jpg"
+
+ if workflow_path.exists():
+ try:
+ from PIL import Image
+ with Image.open(workflow_path) as img:
+ st.image(img, caption="Mistral OCR workflow", use_column_width=True)
+ except Exception:
+ # If there's an error, just show a placeholder
+ st.image("https://placekitten.com/800/400", caption="Mistral OCR workflow placeholder")
+ else:
+ # If the file doesn't exist, show a placeholder
+ st.image("https://placekitten.com/800/400", caption="Mistral OCR workflow placeholder")
+
+ # Practical workflow section
+ workflow_content = """
+
Practical Workflow
+
+
A typical historical OCR workflow with Mistral-OCR includes:
+
+
Selection: Choosing appropriate documents
+
Preprocessing: Enhancing images before OCR
+
OCR Processing: Running documents through vision-enhanced OCR
+
Post-processing: Cleaning up outputs and structured extraction
+
Verification: Cross-checking results against originals
+
Integration: Incorporating OCR outputs into research materials
+ """
+ gray_container(limitations_content)
+
+ # Quote
+ quote_content = "The most powerful digital humanities work occurs at the intersection of computational methods and traditional humanistic inquiry."
+ quote(quote_content, "Dr. Sarah E. Bond, Digital Humanities Scholar")
\ No newline at end of file
diff --git a/streamlit/modules/module5.py b/streamlit/modules/module5.py
new file mode 100644
index 0000000000000000000000000000000000000000..6bc50f5e03de8e234a6c3369da6b68ce4a0faee0
--- /dev/null
+++ b/streamlit/modules/module5.py
@@ -0,0 +1,547 @@
+import streamlit as st
+import io
+import tempfile
+from pathlib import Path
+from datetime import datetime
+from layout import tool_container, key_concept, research_question, upload_container
+import sys
+
+# Import the necessary modules for OCR processing
+sys.path.append(str(Path(__file__).parent.parent))
+try:
+ from process_file import process_file as process_file_util
+ process_file = process_file_util
+except ImportError:
+ # Fallback if process_file is not available
+ def process_file(uploaded_file, use_vision=True, custom_prompt=None):
+ """Fallback function for processing files"""
+ st.warning("Using mock processing function. Real OCR functionality is not available.")
+ return {
+ "file_name": uploaded_file.name,
+ "languages": ["English"],
+ "topics": ["History", "Document"],
+ "ocr_contents": {
+ "content": f"This is mock OCR content for {uploaded_file.name}. Vision model: {use_vision}"
+ }
+ }
+
+def render():
+ """Module 5: Interactive OCR Experiment"""
+
+ st.title("Module 5: Interactive OCR Experiment")
+
+ # Introduction to the interactive experiment
+ intro_content = """
+
Interactive OCR Experiment
+
+ This interactive experiment allows you to process historical documents with OCR and analyze the results.
+ Try different settings and compare the outcomes to understand the strengths and limitations of OCR technology.
+
+ """
+ st.markdown(intro_content, unsafe_allow_html=True)
+
+ # Create tabs for different activities
+ experiment_tab, compare_tab, analyze_tab = st.tabs(["Process Documents", "Compare Results", "Analysis Guide"])
+
+ # Try to import PDF tools if available
+ try:
+ from pdf2image import convert_from_bytes
+ pdf_support = True
+ except ImportError:
+ pdf_support = False
+ st.warning("PDF preview functionality is limited. The pdf2image module is required for PDF previews.")
+
+ with experiment_tab:
+ # Create a two-column layout
+ col1, col2 = st.columns([1, 1])
+
+ with col1:
+ # Tool container for document selection and options
+ st.subheader("Step 1: Select Document & Options")
+
+ # Processing options
+ use_vision = st.checkbox("Use Vision Model", value=True,
+ help="Use the vision model for improved analysis")
+
+ # Additional prompt
+ st.markdown("### Custom Research Prompt (Optional)")
+ st.markdown("""Provide additional instructions to guide the OCR analysis.
+ Focus on specific aspects of historical research you're interested in.""")
+ custom_prompt = st.text_area("Research Prompt",
+ placeholder="E.g., Focus on identifying dates and historical figures...",
+ help="Optional instructions to guide the analysis")
+
+ # Sample document selection
+ input_dir = Path(__file__).parent.parent / "input"
+
+ if input_dir.exists():
+ sample_files = list(input_dir.glob("*.jpg")) + list(input_dir.glob("*.png")) + list(input_dir.glob("*.pdf"))
+
+ if sample_files:
+ st.markdown("#### Sample Documents")
+ sample_options = ["Upload my own document"] + [f.name for f in sample_files]
+ sample_choice = st.selectbox("Choose a document:", sample_options)
+
+ if sample_choice != "Upload my own document":
+ # Process the selected sample file
+ selected_file = next((f for f in sample_files if f.name == sample_choice), None)
+
+ if selected_file:
+ # Store the selected sample file in session state
+ with open(selected_file, "rb") as f:
+ file_bytes = f.read()
+
+ st.session_state.sample_file = {
+ "name": selected_file.name,
+ "bytes": file_bytes
+ }
+
+ # Preview the selected sample
+ if selected_file.suffix.lower() == ".pdf" and pdf_support:
+ try:
+ with st.spinner("Generating PDF preview..."):
+ images = convert_from_bytes(file_bytes, first_page=1, last_page=1, dpi=150)
+ if images:
+ st.image(images[0], caption=f"Preview: {selected_file.name}", use_column_width=True)
+ except Exception:
+ st.info(f"PDF selected: {selected_file.name}")
+ else:
+ # For images display directly
+ try:
+ from PIL import Image
+ img = Image.open(io.BytesIO(file_bytes))
+ st.image(img, caption=f"Preview: {selected_file.name}", use_column_width=True)
+ except Exception:
+ st.info(f"Selected: {selected_file.name}")
+ else:
+ # Clear the sample file if "Upload my own" is selected
+ if 'sample_file' in st.session_state:
+ del st.session_state.sample_file
+
+ # Display file uploader
+ upload_html = """
+
Upload a document to get started
+
Supported formats: PDF, JPG, PNG
+ """
+
+ upload_container(upload_html)
+ uploaded_file = st.file_uploader("Choose a file", type=["pdf", "png", "jpg", "jpeg"], label_visibility="collapsed")
+
+ if uploaded_file is not None:
+ # Display preview of the uploaded file
+ file_ext = Path(uploaded_file.name).suffix.lower()
+
+ if file_ext == ".pdf" and pdf_support:
+ try:
+ # Convert first page of PDF to image for preview
+ pdf_bytes = uploaded_file.getvalue()
+ with st.spinner("Generating PDF preview..."):
+ images = convert_from_bytes(pdf_bytes, first_page=1, last_page=1, dpi=150)
+ if images:
+ st.image(images[0], caption=f"PDF Preview: {uploaded_file.name}", use_column_width=True)
+ else:
+ st.info(f"PDF uploaded: {uploaded_file.name}")
+ except Exception:
+ st.info(f"PDF uploaded: {uploaded_file.name}")
+ elif file_ext != ".pdf":
+ st.image(uploaded_file, use_column_width=True)
+ else:
+ st.info(f"PDF uploaded: {uploaded_file.name}")
+ else:
+ # No sample files, just show the uploader
+ upload_html = """
+
Upload a document to get started
+
Supported formats: PDF, JPG, PNG
+ """
+
+ upload_container(upload_html)
+ uploaded_file = st.file_uploader("Choose a file", type=["pdf", "png", "jpg", "jpeg"], label_visibility="collapsed")
+
+ if uploaded_file is not None:
+ # Display the file preview
+ file_ext = Path(uploaded_file.name).suffix.lower()
+ if file_ext == ".pdf" and pdf_support:
+ try:
+ pdf_bytes = uploaded_file.getvalue()
+ with st.spinner("Generating PDF preview..."):
+ images = convert_from_bytes(pdf_bytes, first_page=1, last_page=1, dpi=150)
+ if images:
+ st.image(images[0], caption=f"PDF Preview: {uploaded_file.name}", use_column_width=True)
+ except Exception:
+ st.info(f"PDF uploaded: {uploaded_file.name}")
+ elif file_ext != ".pdf":
+ st.image(uploaded_file, use_column_width=True)
+ else:
+ st.info(f"PDF uploaded: {uploaded_file.name}")
+ else:
+ # No input directory
+ upload_html = """
+
Upload a document to get started
+
Supported formats: PDF, JPG, PNG
+ """
+
+ upload_container(upload_html)
+ uploaded_file = st.file_uploader("Choose a file", type=["pdf", "png", "jpg", "jpeg"], label_visibility="collapsed")
+
+ # Process button
+ st.subheader("Step 2: Process the Document")
+
+ # Get the file to process (either uploaded or sample)
+ file_to_process = None
+ if 'sample_file' in st.session_state and sample_choice != "Upload my own document":
+ # Create a FileUploader-like object from the sample file
+ class SampleFileObject:
+ def __init__(self, name, data):
+ self.name = name
+ self._data = data
+
+ def getvalue(self):
+ return self._data
+
+ file_to_process = SampleFileObject(
+ st.session_state.sample_file["name"],
+ st.session_state.sample_file["bytes"]
+ )
+ elif 'uploaded_file' in locals() and uploaded_file is not None:
+ file_to_process = uploaded_file
+
+ # Process button
+ process_button = st.button(
+ "Process Document",
+ disabled=file_to_process is None,
+ use_container_width=True
+ )
+
+ if process_button and file_to_process is not None:
+ with st.spinner("Processing document..."):
+ try:
+ # Process the file
+ result = process_file(file_to_process, use_vision, custom_prompt=custom_prompt if custom_prompt else None)
+
+ if result:
+ st.success("Document processed successfully!")
+
+ # Store result in session state for display in the right column
+ st.session_state.current_result = result
+
+ # Add to processing history
+ history_item = {
+ "id": datetime.now().timestamp(),
+ "fileName": file_to_process.name,
+ "timestamp": datetime.now().isoformat(),
+ "result": result,
+ "useVision": use_vision
+ }
+
+ if 'processing_history' not in st.session_state:
+ st.session_state.processing_history = []
+
+ st.session_state.processing_history.append(history_item)
+
+ st.experimental_rerun()
+ else:
+ st.error("Failed to process document.")
+ except Exception as e:
+ st.error(f"Error processing document: {str(e)}")
+
+ # Experiment instructions
+ experiment_content = """
+
Experiment Instructions
+
+
Step 1: Select a document and choose your options
+
Step 2: Process the document with the selected options
+
Step 3: Analyze the results in the panel on the right
+
Step 4: Try again with different settings (e.g., toggle vision model)
+
Step 5: Compare results between different runs
+
+ """
+ key_concept(experiment_content)
+
+ with col2:
+ # Results display
+ st.subheader("Step 3: View Results")
+
+ if 'current_result' in st.session_state and st.session_state.current_result:
+ result = st.session_state.current_result
+
+ # Display results in a tool container
+ result_html = f"""
+
Vision model used: {'Yes' if latest['useVision'] else 'No'}
+ """
+ tool_container(latest_html)
+
+ # History in expander
+ with st.expander("View Complete Processing History"):
+ for i, item in enumerate(reversed(st.session_state.processing_history)):
+ st.markdown(f"""
+
You're comparing the same document processed with different models.
+ This is an excellent way to evaluate the impact of vision capabilities on OCR accuracy.
Process at least two documents to enable side-by-side comparison. Try processing
+ the same document with and without the vision model to see the differences in OCR quality.
+ """
+ research_question(need_more_content)
+
+ # Analysis guide tab
+ with analyze_tab:
+ st.subheader("Analysis Guide")
+
+ st.markdown("""
+ ### How to Analyze OCR Results
+
+ When analyzing OCR results from historical documents, consider these key factors:
+
+ 1. **Text Accuracy**
+ - Check for common OCR errors (e.g., mistaking "e" for "c", "l" for "1")
+ - Assess recognition of period-specific typography and writing styles
+ - Evaluate handling of degraded or damaged text areas
+
+ 2. **Structure Preservation**
+ - Does the OCR maintain paragraph and section breaks?
+ - Are columns and tabular data correctly preserved?
+ - How well are page transitions handled?
+
+ 3. **Special Elements**
+ - Recognition of footnotes, marginalia, and annotations
+ - Handling of illustrations, diagrams, and decorative elements
+ - Treatment of watermarks, signatures, and stamps
+
+ 4. **Metadata Extraction**
+ - Accuracy of detected languages, topics, and document type
+ - Identification of dates, names, and key entities
+ - Recognition of document purpose and context
+ """)
+
+ col1, col2 = st.columns(2)
+
+ with col1:
+ challenge_content = """
+
Common OCR Challenges
+
+
Typography Variations: Historical fonts that differ from modern text
+
Material Degradation: Fading, stains, tears affecting legibility
+
Handwritten Elements: Marginalia, signatures, and annotations
+
Complex Layouts: Multi-column formats and decorative elements
+
Language and Terminology: Archaic terms and multilingual content
Contextual Reading: Use context to interpret unclear passages
+
Error Patterns: Identify and correct systematic OCR errors
+
Hybrid Analysis: Combine OCR search with close reading
+
Comparative Processing: Try different settings on documents
+
Iterative Refinement: Use insights to improve future processing
+
+ """
+ gray_container(tips_content)
+
+ # Show example analysis if there's processing history
+ if 'processing_history' in st.session_state and st.session_state.processing_history:
+ with st.expander("Example Analysis from Your Documents"):
+ # Pick the latest document
+ latest = st.session_state.processing_history[-1]
+
+ st.markdown(f"""
+ #### Sample Analysis for: {latest['fileName']}
+
+ **Document Context:**
+ - Languages: {', '.join(latest['result'].get('languages', ['Unknown']))}
+ - Topics: {', '.join(latest['result'].get('topics', ['Unknown']))}
+ - Vision model used: {'Yes' if latest['useVision'] else 'No'}
+
+ **What to Look For:**
+ 1. Check how well the model identified key topics and languages
+ 2. Evaluate the completeness of extracted text
+ 3. Note any systematic errors in text recognition
+ 4. Assess how well document structure was preserved
+ """)
\ No newline at end of file
diff --git a/streamlit/modules/module6.py b/streamlit/modules/module6.py
new file mode 100644
index 0000000000000000000000000000000000000000..5efa8ee87b1f737172445d3f7caa20dd79c588b4
--- /dev/null
+++ b/streamlit/modules/module6.py
@@ -0,0 +1,154 @@
+import streamlit as st
+from layout import gray_container, key_concept, quote, tool_container
+from datetime import datetime
+
+def render():
+ """Module 6: Conclusion and Future Directions"""
+
+ st.title("Module 6: Conclusion and Future Directions")
+
+ col1, col2 = st.columns([3, 2])
+
+ with col1:
+ summary_content = """
+
Workshop Summary
+
Throughout this workshop, we've explored:
+
+
Text-Image Interdependence: The complex relationship between textual and visual elements
+
OCR Technology: The evolution of OCR and its application to historical materials
+
Methodological Approaches: Hybrid strategies for working with historical texts
+
Practical Application: Hands-on experience with OCR processing tools
+ """
+ gray_container(research_content)
+
+ # Inspiring quote
+ quote_content = "The digital humanities are not about building, they're about sharing. The digital humanities are not about the digital at all. They're all about innovation and disruption. The digital humanities are really an insurgent humanities."
+ quote(quote_content, "Matthew Kirschenbaum, Professor of Digital Humanities")
+
+ # Additional resources
+ resources_content = """
+
This workshop was designed as an educational resource for historians, archivists, and digital humanities scholars.
+
It demonstrates the integration of modern AI vision-language models with historical research methodologies.
+
Special thanks to the digital humanities community for continued innovation in computational approaches to historical research.
+ """
+ st.markdown(acknowledgment_content, unsafe_allow_html=True)
+
+    # Restart the workshop button
+    if st.button("Start Workshop Again", use_container_width=True):
+        # Send the participant back to the first module. The key is only
+        # overwritten when the hosting app has already created it.
+        if 'current_module' in st.session_state:
+            st.session_state.current_module = 1
+
+        # Processing history is intentionally left untouched on restart.
+
+        # st.experimental_rerun() is no longer available in the Streamlit
+        # versions allowed by requirements.txt (>=1.43.2); st.rerun() is the
+        # supported replacement and is what streamlit_app.py already calls.
+        st.rerun()
\ No newline at end of file
diff --git a/streamlit/packages.txt b/streamlit/packages.txt
new file mode 100644
index 0000000000000000000000000000000000000000..c1298f5aebdba9b2e9b819d036559cac9e687863
--- /dev/null
+++ b/streamlit/packages.txt
@@ -0,0 +1,2 @@
+tesseract-ocr
+poppler-utils
\ No newline at end of file
diff --git a/streamlit/process_file.py b/streamlit/process_file.py
new file mode 100644
index 0000000000000000000000000000000000000000..e7dfb5bde566f22aa86a9e8104e88bf5989365ee
--- /dev/null
+++ b/streamlit/process_file.py
@@ -0,0 +1,66 @@
+"""
+Utility function for processing files with OCR in the Historical OCR Workshop app.
+"""
+
+import os
+import tempfile
+from pathlib import Path
+from datetime import datetime
+
+def process_file(uploaded_file, use_vision=True, processor=None, custom_prompt=None):
+    """Run OCR on an uploaded file and return the result with metadata.
+
+    The upload is written to a temporary file, that path is handed to
+    ``processor.process_file`` and the temporary file is removed again
+    before this function returns, whether or not processing succeeded.
+
+    Args:
+        uploaded_file: Object exposing ``name`` and ``getvalue()`` (bytes).
+        use_vision: Whether to use vision model
+        processor: Object with a ``process_file`` method. When None, a
+            ``StructuredOCR`` is imported and instantiated.
+        custom_prompt: Optional additional instructions for the model
+
+    Returns:
+        dict: The processor's result updated with ``file_name``,
+        ``processed_at``, ``file_size_mb`` and ``use_vision``. If anything
+        raises while writing or processing, ``{"error": ..., "file_name": ...}``
+        is returned instead.
+    """
+    # Import the processor lazily so callers that inject their own never
+    # need structured_ocr to be importable.
+    if processor is None:
+        from structured_ocr import StructuredOCR
+        processor = StructuredOCR()
+
+    file_ext = Path(uploaded_file.name).suffix
+
+    # delete=False: the processor receives the path after this handle has
+    # been closed, so removal is done explicitly in the finally clause.
+    tmp = tempfile.NamedTemporaryFile(delete=False, suffix=file_ext)
+    temp_path = tmp.name
+
+    try:
+        # The write lives inside the try so that a failing getvalue()/write
+        # cannot leave the temporary file behind on disk.
+        with tmp:
+            tmp.write(uploaded_file.getvalue())
+
+        # Determine file type from extension
+        file_type = "pdf" if file_ext.lower() == ".pdf" else "image"
+
+        # Get file size in MB
+        file_size_mb = os.path.getsize(temp_path) / (1024 * 1024)
+
+        # Process the file with file size information for automatic page limiting
+        result = processor.process_file(
+            temp_path,
+            file_type=file_type,
+            use_vision=use_vision,
+            file_size_mb=file_size_mb,
+            custom_prompt=custom_prompt,
+        )
+
+        # Add processing metadata
+        result.update({
+            "file_name": uploaded_file.name,
+            "processed_at": datetime.now().isoformat(),
+            "file_size_mb": round(file_size_mb, 2),
+            "use_vision": use_vision,
+        })
+
+        return result
+    except Exception as e:
+        return {
+            "error": str(e),
+            "file_name": uploaded_file.name,
+        }
+    finally:
+        # Clean up the temporary file
+        if os.path.exists(temp_path):
+            os.unlink(temp_path)
\ No newline at end of file
diff --git a/streamlit/requirements.txt b/streamlit/requirements.txt
new file mode 100644
index 0000000000000000000000000000000000000000..76bdca7345b8cabe8acbfcb8b3080597fc47150f
--- /dev/null
+++ b/streamlit/requirements.txt
@@ -0,0 +1,9 @@
+streamlit>=1.43.2
+mistralai
+pydantic
+pycountry
+pillow
+python-multipart
+pdf2image
+pytesseract
+streamlit-javascript
\ No newline at end of file
diff --git a/streamlit/run_local.sh b/streamlit/run_local.sh
new file mode 100644
index 0000000000000000000000000000000000000000..af6e179c7fd64fac7a11852f791128e99a6adf1a
--- /dev/null
+++ b/streamlit/run_local.sh
@@ -0,0 +1,26 @@
+#!/bin/bash
+# Run the Streamlit app locally
+
+# No longer need to add parent directory to PYTHONPATH
+# as we now have a local copy of structured_ocr.py
+
+# Work from the directory containing this script so that .env and
+# streamlit_app.py are found no matter where the script is invoked from.
+cd "$(dirname "${BASH_SOURCE[0]}")" || exit 1
+
+# Load environment variables from .env file if it exists
+if [ -f .env ]; then
+    echo "Loading environment variables from .env file"
+    # allexport makes every variable assigned by .env visible to streamlit
+    set -o allexport
+    source .env
+    set +o allexport
+else
+    echo "No .env file found. Make sure to set MISTRAL_API_KEY environment variable manually."
+fi
+
+# Check if MISTRAL_API_KEY is set
+if [ -z "${MISTRAL_API_KEY:-}" ]; then
+    echo "WARNING: MISTRAL_API_KEY is not set. The app will run with sample data."
+else
+    echo "MISTRAL_API_KEY is set. The app will use the Mistral API for OCR processing."
+fi
+
+# Run the Streamlit app
+# We can run either app.py (which now imports streamlit_app.py) or streamlit_app.py directly
+streamlit run streamlit_app.py
\ No newline at end of file
diff --git a/streamlit/setup_git.sh b/streamlit/setup_git.sh
new file mode 100644
index 0000000000000000000000000000000000000000..331be4e7e41b38ce4ccce2f3efaf6b0a64f4eead
--- /dev/null
+++ b/streamlit/setup_git.sh
@@ -0,0 +1,35 @@
+#!/bin/bash
+# Setup git repository for Hugging Face Spaces
+
+# Check if HF_TOKEN environment variable is set
+if [ -z "${HF_TOKEN:-}" ]; then
+    echo "Error: HF_TOKEN environment variable is not set."
+    echo "Please set it first with: export HF_TOKEN=your_hugging_face_token"
+    exit 1
+fi
+
+# Get your username (-r keeps backslashes in the input literal)
+echo "Enter your Hugging Face username:"
+read -r HF_USERNAME
+
+# Get the space name
+echo "Enter the name for your Hugging Face Space (e.g., historical-ocr):"
+read -r HF_SPACE
+
+# An empty answer would produce a malformed remote URL further down
+if [ -z "$HF_USERNAME" ] || [ -z "$HF_SPACE" ]; then
+    echo "Error: both a username and a Space name are required."
+    exit 1
+fi
+
+# Prepare the files for deployment
+echo "Preparing files for deployment..."
+if ! python3 prepare_for_hf.py; then
+    echo "Error: prepare_for_hf.py failed; nothing was committed or pushed."
+    exit 1
+fi
+
+# Initialize git
+git init
+git add .
+git commit -m "Initial commit"
+# git init may name the first branch "master"; the push below expects "main"
+git branch -M main
+
+# Create the repository on Hugging Face
+echo "Creating and pushing to Hugging Face Space..."
+git remote add origin "https://huggingface.co/spaces/$HF_USERNAME/$HF_SPACE"
+huggingface-cli login --token "$HF_TOKEN"
+
+# Only announce success when the push actually went through
+if ! git push -u origin main; then
+    echo "Error: pushing to Hugging Face failed."
+    exit 1
+fi
+
+echo "Deployment completed! Your app should be available at:"
+echo "https://huggingface.co/spaces/$HF_USERNAME/$HF_SPACE"
\ No newline at end of file
diff --git a/streamlit/streamlit_app.py b/streamlit/streamlit_app.py
new file mode 100644
index 0000000000000000000000000000000000000000..f941891f42dbc951d73a962a9b7f445e166ef07e
--- /dev/null
+++ b/streamlit/streamlit_app.py
@@ -0,0 +1,2028 @@
+import os
+import streamlit as st
+import json
+import sys
+from pathlib import Path
+import tempfile
+from datetime import datetime
+import io
+import base64
+from io import BytesIO
+from enum import Enum
+import inspect
+
+# Add parent directory to path so we can import the OCR modules
+parent_dir = Path(__file__).parent.absolute()
+sys.path.append(str(parent_dir))
+
+# Import the StructuredOCR class and process_file utility
+from structured_ocr import StructuredOCR
+
+# Add API endpoint support for the React app
+from streamlit.web.server.server import Server
+from streamlit.runtime.scriptrunner import get_script_run_ctx
+
+# Custom JSON encoder to handle Enum types and other non-serializable objects
+class EnhancedJSONEncoder(json.JSONEncoder):
+    """JSON encoder that also handles Enums and model-like objects.
+
+    Conversion order for values the base encoder cannot serialize:
+    Enum -> its ``value``; objects with a callable ``model_dump`` (Pydantic
+    models) or ``to_dict`` -> that method's return value; any other object
+    with a ``__dict__`` -> its public (non-underscore) attributes.
+    Everything else is passed to ``json.JSONEncoder.default``, which raises
+    ``TypeError``.
+    """
+
+    def default(self, obj):
+        if isinstance(obj, Enum):
+            return obj.value
+
+        # Dedicated export methods must be tried before the generic
+        # __dict__ fallback: objects that provide them normally have a
+        # __dict__ as well, which previously made these branches unreachable.
+        for exporter_name in ('model_dump', 'to_dict'):
+            exporter = getattr(obj, exporter_name, None)
+            if callable(exporter):
+                return exporter()
+
+        if hasattr(obj, '__dict__'):
+            # For objects that have a __dict__ but aren't directly serializable
+            return {key: value for key, value in obj.__dict__.items()
+                    if not key.startswith('_')}
+
+        # Let the base class handle other types or raise TypeError
+        return super().default(obj)
+
+# Helper function to convert any result to JSON-serializable
+def make_serializable(obj):
+    """Recursively convert ``obj`` into JSON-serializable data.
+
+    Args:
+        obj: Any value; containers and objects are walked recursively.
+
+    Returns:
+        dicts with converted values, lists for list/tuple/set inputs, the
+        ``value`` of Enums, the converted output of ``model_dump()`` or
+        ``to_dict()`` for objects offering one of them, a dict of public
+        attributes for other objects with a ``__dict__``, and the value
+        itself for everything else.
+    """
+    if isinstance(obj, dict):
+        return {k: make_serializable(v) for k, v in obj.items()}
+    if isinstance(obj, (list, tuple, set, frozenset)):
+        # Tuples and sets become lists, so their items get converted too
+        return [make_serializable(item) for item in obj]
+    if isinstance(obj, Enum):
+        return obj.value
+
+    # Dedicated export methods win over the generic __dict__ walk; checking
+    # __dict__ first (as before) made these branches unreachable.
+    for exporter_name in ('model_dump', 'to_dict'):
+        exporter = getattr(obj, exporter_name, None)
+        if callable(exporter):
+            return make_serializable(exporter())
+
+    if hasattr(obj, '__dict__'):
+        result = {k: make_serializable(v) for k, v in obj.__dict__.items()
+                  if not k.startswith('_')}
+        # Special case for OCRResponse-like objects: make sure ``pages`` is
+        # present even when it is not a plain instance attribute.
+        if hasattr(obj, 'pages'):
+            result['pages'] = [make_serializable(page) for page in obj.pages]
+        return result
+
+    # Basic types will be returned as is
+    return obj
+
+# API response handler
+def process_api_request():
+ """Handle API requests from the React frontend
+
+ Looks up the current Streamlit session, takes the first entry stored
+ under the key 'file' in that session's uploaded-file manager and runs
+ it through process_file.
+
+ Returns:
+ None when there is no script-run context, no session info or no
+ uploaded file; otherwise the make_serializable()-converted OCR
+ result, or {"error": <message>} if processing raises.
+
+ NOTE(review): this relies on private Streamlit internals
+ (Server.get_current()._get_session_info, _uploaded_files) - confirm
+ they exist in the Streamlit version pinned in requirements.txt.
+ """
+ # Get the current Streamlit session
+ ctx = get_script_run_ctx()
+ if ctx is None:
+ return
+
+ session_id = ctx.session_id
+ session_info = Server.get_current()._get_session_info(session_id)
+
+ if session_info is None:
+ return
+
+ # Files registered for this session under the key 'file'
+ request = session_info.uploaded_file_mgr._uploaded_files.get('file')
+
+ if not request:
+ return
+
+ # Extract file and parameters
+ uploaded_file = request[0]
+ # Vision defaults to on when the parameter is absent; any value other than
+ # 'true' (case-insensitive) turns it off.
+ # assumes query_string maps names to lists of values - TODO confirm
+ use_vision = session_info.query_string.get('use_vision', ['true'])[0].lower() == 'true'
+
+ try:
+ # Process file
+ # process_file is bound at module level further down in this file
+ result = process_file(uploaded_file, use_vision=use_vision)
+
+ # Convert result to JSON-serializable format
+ serializable_result = make_serializable(result)
+
+ # Return JSON response
+ return serializable_result
+ except Exception as e:
+ # Return error response
+ return {"error": str(e)}
+
+try:
+    from process_file import process_file as process_file_util
+    # Use the utility function instead of the local function
+    process_file = process_file_util
+except ImportError:
+    # Fallback used only when process_file.py cannot be imported. The
+    # signature mirrors the utility so callers can pass the same arguments
+    # whichever implementation ends up bound to ``process_file``.
+    def process_file(uploaded_file, use_vision=True, processor=None, custom_prompt=None):
+        """Process the uploaded file and return the OCR results.
+
+        Args:
+            uploaded_file: Object exposing ``name`` and ``getvalue()`` (bytes).
+            use_vision: Whether to use the vision model.
+            processor: Object with a ``process_file`` method. When None a
+                ``StructuredOCR`` is created.
+            custom_prompt: Optional extra instructions. It is forwarded to
+                the processor only when it is not None.
+
+        Returns:
+            The processor's result, which is also appended to
+            ``st.session_state.processing_history``. On any exception the
+            error is shown with ``st.error`` and None is returned.
+        """
+        file_ext = Path(uploaded_file.name).suffix
+
+        # delete=False: the processor gets the path after this handle is
+        # closed, so removal is done explicitly in the finally clause.
+        tmp = tempfile.NamedTemporaryFile(delete=False, suffix=file_ext)
+        temp_path = tmp.name
+
+        try:
+            # Written inside the try so a failed write cannot leak the file
+            with tmp:
+                tmp.write(uploaded_file.getvalue())
+
+            # Initialize OCR processor
+            if processor is None:
+                processor = StructuredOCR()
+
+            # Determine file type from extension
+            file_type = "pdf" if file_ext.lower() == ".pdf" else "image"
+
+            # Only forward custom_prompt when one was supplied, so the call
+            # is unchanged for callers that do not use it.
+            extra_args = {} if custom_prompt is None else {"custom_prompt": custom_prompt}
+
+            # Process the file
+            result = processor.process_file(
+                temp_path, file_type=file_type, use_vision=use_vision, **extra_args
+            )
+
+            # Add to processing history; one clock reading keeps the id and
+            # the ISO timestamp of an entry in agreement.
+            now = datetime.now()
+            history_item = {
+                "id": now.timestamp(),
+                "fileName": uploaded_file.name,
+                "timestamp": now.isoformat(),
+                "result": result,
+                "useVision": use_vision
+            }
+
+            if 'processing_history' not in st.session_state:
+                st.session_state.processing_history = []
+
+            st.session_state.processing_history.append(history_item)
+
+            return result
+        except Exception as e:
+            st.error(f"Error processing document: {str(e)}")
+            return None
+        finally:
+            # Clean up the temporary file
+            if os.path.exists(temp_path):
+                os.unlink(temp_path)
+
+# Set page configuration
+st.set_page_config(
+ page_title="Historical OCR Workshop",
+ page_icon="📜",
+ layout="wide",
+ initial_sidebar_state="collapsed" # Start with sidebar collapsed for cleaner landing
+)
+
+# Custom CSS to match React dark theme and improve Streamlit integration
+st.markdown("""
+
+""", unsafe_allow_html=True)
+
+# Seed the workshop's per-session defaults; values that already exist are
+# kept so progress survives Streamlit's script reruns.
+if 'current_module' not in st.session_state:
+    st.session_state['current_module'] = 1
+if 'processing_history' not in st.session_state:
+    st.session_state['processing_history'] = []
+if 'workshop_started' not in st.session_state:
+    st.session_state['workshop_started'] = False
+
+def navigate_to_module(module_number):
+    """Record ``module_number`` as the module to display.
+
+    Only session state is updated here; nothing is rendered or rerun.
+    """
+    st.session_state['current_module'] = module_number
+
+# Welcome/Start screen if workshop hasn't been started
+if not st.session_state.workshop_started:
+ # Hero section with eye-catching design
+ st.markdown("""
+
+
Historical OCR Workshop
+
Unlock the potential of historical documents with modern OCR technology
+
+ This interactive workshop explores the application of OCR technology to historical documents,
+ combining theoretical understanding with practical experiences. Designed for historians,
+ archivists, and digital humanities scholars, it offers both conceptual frameworks and hands-on skills.
+
+ Optical Character Recognition (OCR) technology enables computers to extract text from images and documents.
+ Modern OCR uses AI vision models to understand both the text and its visual context, making it powerful for
+ historical research and digital humanities.
+
+ """, unsafe_allow_html=True)
+
+ with col2:
+ # Add an engaging research question to connect with historians
+ st.markdown("""
+
+
For Historians:
+ How might OCR technology transform our access to and interpretation of historical documents?
+ What new research questions become possible when large archives become machine-readable?
+
+ """, unsafe_allow_html=True)
+
+ # Display sample historical document images in a better format
+ input_dir = Path(__file__).parent / "input"
+ sample_images = [
+ {"path": input_dir / "letter-1.jpg", "caption": "Historical Letter"},
+ {"path": input_dir / "recipe.jpg", "caption": "Historical Recipe"}
+ ]
+
+ # Try to find any of the sample images
+ for sample in sample_images:
+ if sample["path"].exists():
+ try:
+ from PIL import Image
+ with Image.open(sample["path"]) as img:
+ # Add a better styled border and shadow
+ st.markdown(f"""
+
+
+ - Processing historical documents with OCR
+ - Analyzing and structuring extracted information
+ - Integrating OCR into research workflows
+ - Building searchable archives
+
+ """, unsafe_allow_html=True)
+
+ with col2:
+ # Workshop modules with visually appealing cards
+ st.markdown('
Workshop Modules
', unsafe_allow_html=True)
+
+ # Add some styling for the module cards
+ st.markdown("""
+
+ """, unsafe_allow_html=True)
+
+ # Modules inside the col2 from earlier
+ with col2:
+ st.markdown("""
+
+
Module 1
+
Introduction and Problematization
+
Explore the challenges of historical document digitization and the potential of OCR technologies
+ to transform historical research. Examine key problems and opportunities in historical OCR.
Understand the evolution of OCR technology from pattern matching to modern AI vision-language models,
+ and how they address the unique challenges of historical documents.
Develop hybrid methodologies that combine computational processing with traditional
+ historical research practices, balancing distant and close reading.
Synthesize workshop insights and explore future directions for OCR in historical research,
+ from large-scale corpus analysis to computational paleography.
+
+ """, unsafe_allow_html=True)
+
+ # Engaging quote to inspire participation with citation - in a better styled container
+ st.markdown("""
+
+
"
+
+
The digital turn in historical research is not just about converting analog to digital;
+ it's about transforming how we access, analyze, and interpret the past.
+
+
— Dr. Jane Winters, Professor of Digital Humanities
+
+ """, unsafe_allow_html=True)
+
+ # Feature highlight before call to action with better styling
+ st.markdown("""
+
+
+
+
Workshop Highlights
+
+
+
+
Interactive Learning
+
Hands-on document processing with real-time feedback and analysis
+
+
+
+
Real Historical Documents
+
Work with authentic materials spanning different eras and formats
+
+
+
+
Vision AI Models
+
Experience state-of-the-art OCR technology powered by advanced AI
+
+
+
+
Research Applications
+
Learn to integrate OCR into historical research workflows
+
+
+
+ """, unsafe_allow_html=True)
+
+ # Enhanced start button with dynamic styling and clear call to action
+ st.markdown("""
+
+
Ready to Start Your Journey?
+
+
+
+
No installation required • Start immediately • Interactive experience
+
+
+
+ """, unsafe_allow_html=True)
+
+ # Hidden button to trigger the workshop start
+ col1, col2, col3 = st.columns([1, 1, 1])
+ with col2:
+ if st.button("Begin Workshop", key="streamlit-button", use_container_width=True, type="primary"):
+ st.session_state.workshop_started = True
+ st.rerun()
+
+# Display workshop navigation sidebar only if workshop has started
+elif st.session_state.workshop_started:
+ # Define input directory for images
+ input_dir = Path(__file__).parent / "input"
+
+ # Enhanced sidebar navigation
+ with st.sidebar:
+ st.markdown("
+ """, unsafe_allow_html=True)
+
+ # Hidden button for jump
+ if st.button("Jump to Experiment", key="jump_exp"):
+ navigate_to_module(5)
+ st.rerun()
+
+ # Workshop information in a cleaner collapsible section
+ with st.expander("About the Workshop"):
+ st.markdown("""
+ This interactive workshop explores OCR technology for historical documents.
+
+ **How to use this workshop:**
+ 1. Navigate through modules sequentially
+ 2. Expand content sections to read more
+ 3. Try the interactive OCR experiment
+ 4. Reflect on research questions
+
+ For help or more information, use the reference materials in Module 6.
+ """)
+
+ # Enhanced progress tracking
+ if st.session_state.processing_history:
+ with st.expander("Your Activity"):
+ st.markdown(f"Documents processed: {len(st.session_state.processing_history)}", unsafe_allow_html=True)
+
+ # Show the most recent document processed with better formatting
+ latest = st.session_state.processing_history[-1]
+ st.markdown(f"""
+
+ Latest document: {latest['fileName']}
+ Processed with {' vision model' if latest['useVision'] else ' basic OCR'}
+
+ """, unsafe_allow_html=True)
+
+ # Main content based on current module
+ if st.session_state.current_module == 1:
+ # MODULE 1: Introduction
+ st.title("Module 1: Introduction and Problematization")
+
+ col1, col2 = st.columns([2, 1])
+
+ with col1:
+ st.markdown("""
+ ## Historical OCR Workshop
+
+ ### The Problem
+
+ Historical documents present unique challenges for OCR technology:
+
+ - Varying typography and handwriting styles
+ - Document degradation and damage
+ - Complex layouts and formatting
+ - Multiple languages and archaic terminology
+ - Illustrations and decorative elements
+ """)
+
+ with col2:
+ st.markdown("""
+ ### Workshop Goals
+
+ By the end of this workshop, you will:
+
+ 1. Understand text-image relationships in historical archives
+ 2. Learn about advanced OCR technology
+ 3. Explore methodological approaches
+ 4. Gain hands-on experience with OCR tools
+ 5. Develop research integration strategies
+ """)
+
+ # Next button
+ st.button("Next: Text-Image Relations", key="next_to_2", on_click=navigate_to_module, args=(2,))
+
+ elif st.session_state.current_module == 2:
+ # MODULE 2: Text-Image Relations
+ st.title("Module 2: Text-Image Relations in Historical Archives")
+
+ col1, col2 = st.columns([1, 1])
+
+ with col1:
+ st.markdown("""
+ ### Textual Elements
+
+ - **Typography**: Varying fonts, sizes, and styles
+ - **Layout**: Columns, margins, and spacing
+ - **Marginalia**: Notes, comments, and additions
+ - **Decorative Text**: Illuminated letters and calligraphy
+ """)
+
+ st.markdown("""
+ ### Visual Elements
+
+ - **Illustrations**: Diagrams, maps, and artistic representations
+ - **Watermarks**: Hidden identifiers that locate documents
+ - **Damage**: Tears, stains, and fading affecting legibility
+ - **Material Features**: Paper quality and physical dimensions
+ """)
+
+ with col2:
+ st.markdown("""
+ ### Interdependence
+
+ The relationship between text and image in historical documents exists on a complex spectrum:
+
+ - Text functions as image (decorative headings)
+ - Images function as text (symbolic representations)
+ - Layout creates meaning through visual organization
+ - Material conditions affect both textual and visual elements
+ """)
+
+ st.image("https://upload.wikimedia.org/wikipedia/commons/thumb/0/0c/Book_of_Kells_folio_292r.jpg/800px-Book_of_Kells_folio_292r.jpg",
+ caption="Book of Kells - Example of text-image integration")
+
+ st.markdown("""
+ ### OCR Challenges
+
+ These complex text-image relationships create particular challenges for OCR:
+
+ 1. **Distinguishing Text from Decoration**: Where does ornamental text end and functional text begin?
+ 2. **Handling Illustrations**: Should they be processed as images or described as text?
+ 3. **Interpreting Layout**: How to capture the significance of spacing and organization?
+ 4. **Preserving Context**: Maintaining the relationship between textual and visual elements
+ """)
+
+ # Navigation buttons
+ col1, col2 = st.columns(2)
+ with col1:
+ st.button("Previous: Introduction", key="prev_to_1", on_click=navigate_to_module, args=(1,))
+
+ with col2:
+ st.button("Next: OCR Technology", key="next_to_3", on_click=navigate_to_module, args=(3,))
+
+ elif st.session_state.current_module == 3:
+ # MODULE 3: OCR Technology
+ st.title("Module 3: OCR Technology and Historical Documents")
+
+ col1, col2 = st.columns([1, 1])
+
+ with col1:
+ st.markdown("""
+ ### Traditional OCR Approaches
+
+ 1. **Pattern Matching**: Early OCR compared characters to templates
+ 2. **Feature Extraction**: Identifying key features of characters
+ 3. **Statistical Models**: Using probabilities to improve recognition
+ """)
+
+ st.markdown("""
+ ### Modern AI-Enhanced OCR
+
+ 1. **Neural Networks**: Deep learning models trained on vast datasets
+ 2. **Computer Vision**: Advanced image processing techniques
+ 3. **Language Models**: Contextual understanding to resolve ambiguities
+ 4. **Multimodal Models**: Integration of text, layout, and visual understanding
+ """)
+
+ with col2:
+ st.markdown("""
+ ### Challenges with Historical Documents
+
+ Historical materials present unique difficulties:
+
+ - **Typography Variation**: Non-standardized fonts and styles
+ - **Historical Language**: Archaic vocabulary and grammar
+ - **Layout Complexity**: Non-linear arrangements
+ - **Document Degradation**: Fading, tears, stains, and damage
+ - **Material Artifacts**: Paper texture, binding shadows, etc.
+ """)
+
+ st.image("https://cdn.dribbble.com/users/412119/screenshots/16353886/media/82e593c60a5e4d460db917236eab6ece.jpg",
+ caption="OCR processing layers")
+
+ # Display processing history if available
+ if st.session_state.processing_history:
+ with st.expander("Your OCR Processing History"):
+ st.markdown("You've already processed the following documents:")
+
+ for item in st.session_state.processing_history:
+ st.markdown(f"**{item['fileName']}** - {datetime.fromisoformat(item['timestamp']).strftime('%Y-%m-%d %H:%M')}")
+ col1, col2 = st.columns(2)
+ with col1:
+ st.write(f"**Topics:** {', '.join(item['result'].get('topics', ['Unknown']))}")
+ with col2:
+ st.write(f"**Vision model used:** {'Yes' if item['useVision'] else 'No'}")
+
+ # Quick link to experiment
+ st.button("Jump to OCR Experiment", key="jump_to_5", on_click=navigate_to_module, args=(5,))
+
+ # Navigation buttons
+ col1, col2 = st.columns(2)
+ with col1:
+ st.button("Previous: Text-Image Relations", key="prev_to_2", on_click=navigate_to_module, args=(2,))
+
+ with col2:
+ st.button("Next: Methodological Approaches", key="next_to_4", on_click=navigate_to_module, args=(4,))
+
+ elif st.session_state.current_module == 4:
+ # MODULE 4: Methodological Approaches
+ st.title("Module 4: Methodological Approaches")
+
+ col1, col2 = st.columns([1, 1])
+
+ with col1:
+ st.markdown("""
+ ### Hybrid Methodologies
+
+ 1. **Computational + Human Reading**
+ - OCR for initial processing and discovery
+ - Human review for context and interpretation
+ - Iterative refinement of computational outputs
+
+ 2. **Close + Distant Reading**
+ - Distant reading through large-scale OCR processing
+ - Close reading of selected passages
+ - Zooming between scales of analysis
+ """)
+
+ # Reference to diagram.jpg
+ input_dir = Path(__file__).parent / "input"
+ diagram_path = input_dir / "diagram.jpg"
+ if diagram_path.exists():
+ try:
+ # Load image file directly from disk
+ from PIL import Image
+ with Image.open(diagram_path) as img:
+ st.image(img, caption="Historical VLM architecture", use_column_width=True)
+ except Exception:
+ pass
+
+ with col2:
+ st.markdown("""
+ ### Mistral-OCR-Latest: State-of-the-Art
+
+ The Mistral-OCR model represents a significant advancement:
+
+ - **Multimodal Understanding**: Processes both visual and textual information
+ - **Contextual Awareness**: Considers historical context
+ - **Layout Recognition**: Preserves complex document structures
+ - **Historical Font Adaptation**: Trained on diverse historical typography
+ """)
+
+ # Reference to workflow.jpg
+ workflow_path = input_dir / "workflow.jpg"
+ if workflow_path.exists():
+ try:
+ # Load image file directly from disk
+ from PIL import Image
+ with Image.open(workflow_path) as img:
+ st.image(img, caption="Mistral OCR workflow", use_column_width=True)
+ except Exception:
+ pass
+
+ st.markdown("""
+ ### Practical Workflow
+
+ A typical historical OCR workflow with Mistral-OCR includes:
+
+ 1. **Selection**: Choosing appropriate documents
+ 2. **Preprocessing**: Enhancing images before OCR
+ 3. **OCR Processing**: Running documents through vision-enhanced OCR
+ 4. **Post-processing**: Cleaning up outputs and structured extraction
+ 5. **Verification**: Cross-checking results against originals
+ 6. **Integration**: Incorporating OCR outputs into research materials
+ """)
+
+ # Navigation buttons
+ col1, col2 = st.columns(2)
+ with col1:
+ st.button("Previous: OCR Technology", key="prev_to_3", on_click=navigate_to_module, args=(3,))
+
+ with col2:
+ st.button("Next: Interactive OCR Experiment", key="next_to_5", on_click=navigate_to_module, args=(5,))
+
+ elif st.session_state.current_module == 5:
+ # MODULE 5: Interactive OCR Experiment
+ st.title("Module 5: Interactive OCR Experiment")
+
+ # More modular design with sequenced steps
+ st.markdown("""
+
+ This interactive module allows you to process historical documents with OCR and analyze the results.
+ Follow the sequenced steps below to experiment with historical document analysis.
+
+ """, unsafe_allow_html=True)
+
+ # Tabbed interface for different activities
+ experiment_tab, compare_tab, analyze_tab = st.tabs(["Process Documents", "Compare Results", "Analysis Guide"])
+
+ with experiment_tab:
+ # Import additional libraries for enhanced functionality
+ try:
+ from pdf2image import convert_from_bytes
+ pdf_support = True
+ except ImportError:
+ pdf_support = False
+ st.warning("PDF preview functionality is limited. The pdf2image module is required for PDF previews.")
+
+ # OCR tool in a compact layout
+ col1, col2 = st.columns([1, 1])
+
+ with col1:
+ st.markdown('
', unsafe_allow_html=True)
+ st.markdown("
Step 1: Select Document & Options
", unsafe_allow_html=True)
+
+ # Processing options
+ use_vision = st.checkbox("Use Vision Model", value=True,
+ help="Use the vision model for improved analysis")
+
+ # Additional prompt for the model
+ st.markdown("### Custom Research Prompt (Optional)")
+ st.markdown("""Provide additional instructions to guide the OCR analysis.
+ Focus on specific aspects of historical research you're interested in.""")
+ custom_prompt = st.text_area("Research Prompt",
+ placeholder="E.g., Focus on identifying dates and historical figures; Analyze the writing style for period-specific terminology; Highlight any cultural or social indicators of the time period...",
+ help="Optional instructions to guide the analysis of the historical document")
+
+ # Example preset documents selection
+ input_dir = Path(__file__).parent / "input"
+ if input_dir.exists():
+ sample_files = list(input_dir.glob("*.jpg")) + list(input_dir.glob("*.png")) + list(input_dir.glob("*.pdf"))
+ if sample_files:
+ st.markdown("#### Sample Documents")
+ sample_options = ["Upload my own document"] + [f.name for f in sample_files]
+ sample_choice = st.selectbox("Choose a document:", sample_options)
+
+ if sample_choice != "Upload my own document":
+ selected_file = next((f for f in sample_files if f.name == sample_choice), None)
+ if selected_file:
+ # Store the selected sample file in session state
+ with open(selected_file, "rb") as f:
+ file_bytes = f.read()
+ st.session_state.sample_file = {
+ "name": selected_file.name,
+ "bytes": file_bytes
+ }
+
+ # Preview the selected sample
+ if selected_file.suffix.lower() == ".pdf" and pdf_support:
+ try:
+ with st.spinner("Generating PDF preview..."):
+ images = convert_from_bytes(file_bytes, first_page=1, last_page=1, dpi=150)
+ if images:
+ # Convert PIL image to bytes for Streamlit
+ first_page = images[0]
+ img_bytes = io.BytesIO()
+ first_page.save(img_bytes, format='JPEG')
+ img_bytes.seek(0)
+ st.image(img_bytes, caption=f"Preview: {selected_file.name}", use_container_width=True)
+ except Exception:
+ st.info(f"PDF selected: {selected_file.name}")
+ else:
+ # For images display directly
+ try:
+ from PIL import Image
+ img = Image.open(BytesIO(file_bytes))
+ st.image(img, caption=f"Preview: {selected_file.name}", use_container_width=True)
+ except Exception:
+ st.info(f"Selected: {selected_file.name}")
+ else:
+ # Clear the sample file if "Upload my own" is selected
+ if 'sample_file' in st.session_state:
+ del st.session_state.sample_file
+
+ # File uploader with styling matched to React theme
+ st.markdown('
', unsafe_allow_html=True)
+ uploaded_file = st.file_uploader("Choose a file", type=["pdf", "png", "jpg", "jpeg"], label_visibility="collapsed")
+
+ if uploaded_file is None:
+ st.markdown("### Upload a document to get started")
+ st.markdown("Supported formats: PDF, JPG, PNG")
+ else:
+ # Display the uploaded file
+ file_ext = Path(uploaded_file.name).suffix.lower()
+
+ if file_ext == ".pdf" and pdf_support:
+ try:
+ # Convert first page of PDF to image for preview
+ pdf_bytes = uploaded_file.getvalue()
+ with st.spinner("Generating PDF preview..."):
+ images = convert_from_bytes(pdf_bytes, first_page=1, last_page=1, dpi=150)
+
+ if images:
+ # Convert PIL image to bytes for Streamlit
+ first_page = images[0]
+ img_bytes = io.BytesIO()
+ first_page.save(img_bytes, format='JPEG')
+ img_bytes.seek(0)
+
+ # Display the PDF preview
+ st.image(img_bytes, caption=f"PDF Preview: {uploaded_file.name}", use_container_width=True)
+ else:
+ st.info(f"PDF uploaded: {uploaded_file.name}")
+ except Exception:
+ # Simply show the file name without an error message
+ st.info(f"PDF uploaded: {uploaded_file.name}")
+ elif file_ext != ".pdf":
+ st.image(uploaded_file, use_container_width=True)
+ else:
+ st.info(f"PDF uploaded: {uploaded_file.name}")
+
+ st.markdown('
', unsafe_allow_html=True)
+ else:
+ # No sample files found, just show the uploader
+ st.markdown('
', unsafe_allow_html=True)
+ uploaded_file = st.file_uploader("Choose a file", type=["pdf", "png", "jpg", "jpeg"], label_visibility="collapsed")
+
+ if uploaded_file is None:
+ st.markdown("### Upload a document to get started")
+ st.markdown("Supported formats: PDF, JPG, PNG")
+ else:
+ # Display the uploaded file preview
+ file_ext = Path(uploaded_file.name).suffix.lower()
+ if file_ext == ".pdf" and pdf_support:
+ try:
+ # PDF preview logic
+ pdf_bytes = uploaded_file.getvalue()
+ with st.spinner("Generating PDF preview..."):
+ images = convert_from_bytes(pdf_bytes, first_page=1, last_page=1, dpi=150)
+ if images:
+ first_page = images[0]
+ img_bytes = io.BytesIO()
+ first_page.save(img_bytes, format='JPEG')
+ img_bytes.seek(0)
+ st.image(img_bytes, caption=f"PDF Preview: {uploaded_file.name}", use_container_width=True)
+ else:
+ st.info(f"PDF uploaded: {uploaded_file.name}")
+ except Exception:
+ st.info(f"PDF uploaded: {uploaded_file.name}")
+ elif file_ext != ".pdf":
+ st.image(uploaded_file, use_container_width=True)
+ else:
+ st.info(f"PDF uploaded: {uploaded_file.name}")
+ st.markdown('
', unsafe_allow_html=True)
+ else:
+ # Input directory doesn't exist, just show the uploader
+ st.markdown('
', unsafe_allow_html=True)
+ uploaded_file = st.file_uploader("Choose a file", type=["pdf", "png", "jpg", "jpeg"], label_visibility="collapsed")
+
+ if uploaded_file is None:
+ st.markdown("### Upload a document to get started")
+ st.markdown("Supported formats: PDF, JPG, PNG")
+ st.markdown('
', unsafe_allow_html=True)
+
+ # Step 2: Process document
+ st.subheader("Step 2: Process the Document")
+
+ # Get the file to process (either uploaded or sample)
+ file_to_process = None
+ if 'sample_file' in st.session_state and sample_choice != "Upload my own document":
+ # Create a FileUploader-like object from the sample file
+ class SampleFileObject:
+ def __init__(self, name, data):
+ self.name = name
+ self._data = data
+
+ def getvalue(self):
+ return self._data
+
+ file_to_process = SampleFileObject(
+ st.session_state.sample_file["name"],
+ st.session_state.sample_file["bytes"]
+ )
+ elif 'uploaded_file' in locals() and uploaded_file is not None:
+ file_to_process = uploaded_file
+
+ # Process button (disabled if no file selected)
+ process_button = st.button(
+ "Process Document",
+ disabled=file_to_process is None,
+ use_container_width=True
+ )
+
+ if process_button and file_to_process is not None:
+ with st.spinner("Processing document..."):
+ try:
+ # Process the file
+ result = process_file(file_to_process, use_vision, custom_prompt=custom_prompt if custom_prompt else None)
+
+ if result:
+ st.success("Document processed successfully!")
+
+ # Store result in session state for display in the right column
+ st.session_state.current_result = result
+ st.rerun() # Refresh to show result
+ else:
+ st.error("Failed to process document.")
+ except Exception as e:
+ st.error(f"Error processing document: {str(e)}")
+
+ st.markdown('
', unsafe_allow_html=True)
+
+ # Experiment instructions in a compact format
+ st.markdown('
', unsafe_allow_html=True)
+ st.markdown("
Experiment Instructions
", unsafe_allow_html=True)
+ st.markdown("""
+ 1. **Step 1:** Select a document and choose your options
+ 2. **Step 2:** Process the document with the selected options
+ 3. **Step 3:** Analyze the results in the panel on the right
+ 4. **Step 4:** Try again with different settings (e.g., toggle vision model)
+ 5. **Step 5:** Compare results between different runs
+ """)
+ st.markdown('
', unsafe_allow_html=True)
+
+ with col2:
+ st.markdown('
', unsafe_allow_html=True)
+ st.markdown("
Step 3: View Results
", unsafe_allow_html=True)
+
+ if 'current_result' in st.session_state and st.session_state.current_result:
+ result = st.session_state.current_result
+
+ # File info in a compact format
+ st.markdown(f"**File:** {result.get('file_name', 'Unknown')}")
+
+ # Horizontal display of metadata
+ col1, col2 = st.columns(2)
+ with col1:
+ if 'languages' in result and result['languages']:
+ languages = [lang for lang in result['languages'] if lang]
+ if languages:
+ st.markdown(f"**Languages:** {', '.join(languages)}")
+
+ with col2:
+ if 'topics' in result and result['topics']:
+ st.markdown(f"**Topics:** {', '.join(result['topics'])}")
+
+ # Create tabs for different views with inline styling to ensure visibility
+ tab1, tab2 = st.tabs(["Structured View", "Raw JSON"])
+ st.markdown("""
+
+ """, unsafe_allow_html=True)
+
+ with tab1:
+ # Display in a more user-friendly format based on the content structure
+ if 'ocr_contents' in result:
+ if isinstance(result['ocr_contents'], dict):
+ for section, content in result['ocr_contents'].items():
+ if content: # Only display non-empty sections
+ st.markdown(f"
{section.replace('_', ' ').title()}
", unsafe_allow_html=True)
+
+ if isinstance(content, str):
+ st.markdown(f"
{content}
", unsafe_allow_html=True)
+ elif isinstance(content, list):
+ for item in content:
+ if isinstance(item, str):
+ st.markdown(f"
", unsafe_allow_html=True)
+ elif isinstance(content, dict):
+ for k, v in content.items():
+ st.markdown(f"
{k}: {v}
", unsafe_allow_html=True)
+
+ with tab2:
+ # Show the raw JSON for developers
+ # Convert to serializable format first
+ serializable_result = make_serializable(result)
+ st.json(serializable_result)
+
+ # Download options
+ st.markdown("### Export Results")
+
+ col1, col2 = st.columns(2)
+
+ with col1:
+ # Export as JSON (using the serializable converter)
+ serializable_result = make_serializable(result)
+ json_bytes = json.dumps(serializable_result, indent=2).encode()
+ st.download_button(
+ label="Download JSON",
+ data=json_bytes,
+ file_name="ocr_results.json",
+ mime="application/json",
+ use_container_width=True
+ )
+
+ with col2:
+ # Export as text
+ if 'ocr_contents' in result and isinstance(result['ocr_contents'], dict) and 'content' in result['ocr_contents']:
+ text_content = result['ocr_contents']['content']
+ st.download_button(
+ label="Download Text",
+ data=text_content.encode(),
+ file_name="ocr_text.txt",
+ mime="text/plain",
+ use_container_width=True
+ )
+ else:
+ st.markdown("""
+
+
Results will appear here
+
+
Upload and process a document to see the OCR results in this panel.
+
+
The OCR tool will:
+
+
Extract text from your document
+
Identify languages and topics
+
Provide structured content analysis
+
Generate downloadable results
+
+
+ """, unsafe_allow_html=True)
+ st.markdown('
', unsafe_allow_html=True)
+
+ # Processing history
+ if st.session_state.processing_history:
+ st.markdown('
', unsafe_allow_html=True)
+ st.markdown("
Step 4: Review Processing History
", unsafe_allow_html=True)
+
+ # Most recent result summary
+ latest = st.session_state.processing_history[-1]
+ st.markdown(f"**Latest Document:** {latest['fileName']}")
+ st.markdown(f"**Processed at:** {datetime.fromisoformat(latest['timestamp']).strftime('%Y-%m-%d %H:%M')}")
+ st.markdown(f"**Vision model used:** {'Yes' if latest['useVision'] else 'No'}")
+
+ # Full history in expander
+ with st.expander("View Complete Processing History"):
+ for i, item in enumerate(reversed(st.session_state.processing_history)):
+ st.markdown(f"""
+
+ """, unsafe_allow_html=True)
+
+ # Add option to view a previous result
+ if st.button(f"View This Result", key=f"view_history_{i}"):
+ st.session_state.current_result = item['result']
+ st.rerun()
+ st.markdown('
', unsafe_allow_html=True)
+
+ # Compare tab for side-by-side comparison
+ with compare_tab:
+ st.subheader("Compare OCR Results")
+
+ if len(st.session_state.processing_history) >= 2:
+ st.markdown("""
+ Select two processing results to compare side by side. This allows you to see
+ how different options (like using the vision model) affect OCR quality.
+ """)
+
+ # Create selection dropdowns for the documents
+ col1, col2 = st.columns(2)
+ with col1:
+ # First document selector
+ doc_options_1 = [f"{i+1}: {item['fileName']} ({'Vision' if item['useVision'] else 'No Vision'})"
+ for i, item in enumerate(st.session_state.processing_history)]
+ doc_choice_1 = st.selectbox("First Document:", doc_options_1, key="compare_doc_1")
+ doc_index_1 = int(doc_choice_1.split(":")[0]) - 1
+
+ with col2:
+ # Second document selector
+ doc_options_2 = [f"{i+1}: {item['fileName']} ({'Vision' if item['useVision'] else 'No Vision'})"
+ for i, item in enumerate(st.session_state.processing_history)]
+ default_index = min(1, len(st.session_state.processing_history) - 1) # Default to second item
+ doc_choice_2 = st.selectbox("Second Document:", doc_options_2, key="compare_doc_2", index=default_index)
+ doc_index_2 = int(doc_choice_2.split(":")[0]) - 1
+
+ # Retrieve the selected documents
+ doc1 = st.session_state.processing_history[doc_index_1]
+ doc2 = st.session_state.processing_history[doc_index_2]
+
+ # Show comparison
+ col1, col2 = st.columns(2)
+
+ with col1:
+ st.markdown(f"### Document 1: {doc1['fileName']}")
+ st.markdown(f"**Processed at:** {datetime.fromisoformat(doc1['timestamp']).strftime('%Y-%m-%d %H:%M')}")
+ st.markdown(f"**Vision model used:** {'Yes' if doc1['useVision'] else 'No'}")
+
+ # Display content summary
+ if 'ocr_contents' in doc1['result'] and isinstance(doc1['result']['ocr_contents'], dict):
+ if 'content' in doc1['result']['ocr_contents']:
+ content = doc1['result']['ocr_contents']['content']
+ # Display first 500 characters with word wrap
+ st.markdown(f"""
+
+ {content[:500]}{'...' if len(content) > 500 else ''}
+
+ """, unsafe_allow_html=True)
+
+ with col2:
+ st.markdown(f"### Document 2: {doc2['fileName']}")
+ st.markdown(f"**Processed at:** {datetime.fromisoformat(doc2['timestamp']).strftime('%Y-%m-%d %H:%M')}")
+ st.markdown(f"**Vision model used:** {'Yes' if doc2['useVision'] else 'No'}")
+
+ # Display content summary
+ if 'ocr_contents' in doc2['result'] and isinstance(doc2['result']['ocr_contents'], dict):
+ if 'content' in doc2['result']['ocr_contents']:
+ content = doc2['result']['ocr_contents']['content']
+ # Display first 500 characters with word wrap
+ st.markdown(f"""
+
+ {content[:500]}{'...' if len(content) > 500 else ''}
+
+ """, unsafe_allow_html=True)
+
+ # Comparison analysis
+ if doc1['fileName'] == doc2['fileName'] and doc1['useVision'] != doc2['useVision']:
+ st.markdown("""
+
+
Vision vs. Non-Vision Model Comparison
+
You're comparing the same document processed with different models.
+ This is an excellent way to evaluate the impact of vision capabilities on OCR accuracy.
Process at least two documents to enable side-by-side comparison. Try processing
+ the same document with and without the vision model to see the differences in OCR quality.
+
+ """, unsafe_allow_html=True)
+
+ # Analysis tab for guidance on working with OCR results
+ with analyze_tab:
+ st.subheader("Analysis Guide")
+
+ st.markdown("""
+
+
How to Analyze OCR Results
+
This guide helps you assess the quality and usefulness of OCR output for historical research.
+
+ """, unsafe_allow_html=True)
+
+ st.markdown("""
+ ### Evaluating OCR Quality
+
+ When analyzing OCR results from historical documents, consider these key factors:
+
+ 1. **Text Accuracy**
+ - Check for common OCR errors (e.g., mistaking "e" for "c", "l" for "1")
+ - Assess recognition of period-specific typography and writing styles
+ - Evaluate handling of degraded or damaged text areas
+
+ 2. **Structure Preservation**
+ - Does the OCR maintain paragraph and section breaks?
+ - Are columns and tabular data correctly preserved?
+ - How well are page transitions handled?
+
+ 3. **Special Elements**
+ - Recognition of footnotes, marginalia, and annotations
+ - Handling of illustrations, diagrams, and decorative elements
+ - Treatment of watermarks, signatures, and stamps
+
+ 4. **Metadata Extraction**
+ - Accuracy of detected languages, topics, and document type
+ - Identification of dates, names, and key entities
+ - Recognition of document purpose and context
+ """)
+
+ col1, col2 = st.columns(2)
+
+ with col1:
+ st.markdown("""
+ ### Common OCR Challenges in Historical Documents
+
+ - **Typography Variations**: Historical fonts and writing styles that differ from modern text
+ - **Material Degradation**: Fading, stains, tears, and other damage affecting legibility
+ - **Handwritten Elements**: Marginalia, signatures, and handwritten annotations
+ - **Complex Layouts**: Multi-column formats, non-linear reading order, and decorative elements
+ - **Language and Terminology**: Archaic terms, specialized vocabulary, and multilingual content
+ """)
+
+ with col2:
+ st.markdown("""
+ ### Making the Most of OCR Results
+
+ - **Contextual Reading**: Use historical context to interpret unclear passages
+ - **Error Patterns**: Identify and mentally correct for systematic OCR errors
+ - **Hybrid Analysis**: Combine OCR-based search with close reading of original images
+ - **Comparative Processing**: Try different OCR settings and models on the same document
+ - **Iterative Refinement**: Use insights from each document to improve future processing
+ """)
+
+ st.markdown("""
+ ### Research Integration
+
+ Once you've obtained and analyzed OCR results from historical documents, consider these approaches for integrating them into your research:
+
+ 1. **Digital Corpus Building**: Create searchable collections of processed texts
+ 2. **Computational Analysis**: Apply text mining, topic modeling, or network analysis
+ 3. **Cross-Document Linking**: Identify connections across multiple sources
+ 4. **Annotation and Enrichment**: Add context, translations, or explanatory notes
+ 5. **Collaborative Research**: Share processed texts with other researchers
+
+ Remember that OCR is a tool to assist your research, not replace careful reading and analysis. The most effective approaches combine computational methods with traditional historical research practices.
+ """)
+
+ # Example of what to look for
+ if st.session_state.processing_history:
+ with st.expander("Example Analysis from Your Documents"):
+ # Pick the latest document
+ latest = st.session_state.processing_history[-1]
+
+ st.markdown(f"""
+ #### Sample Analysis for: {latest['fileName']}
+
+ **Document Context:**
+ - Languages: {', '.join(latest['result'].get('languages', ['Unknown']))}
+ - Topics: {', '.join(latest['result'].get('topics', ['Unknown']))}
+ - Vision model used: {'Yes' if latest['useVision'] else 'No'}
+
+ **What to Look For:**
+ 1. Check how well the model identified key topics and languages
+ 2. Evaluate the completeness of extracted text
+ 3. Note any systematic errors in text recognition
+ 4. Assess how well document structure was preserved
+ """)
+
+ # Navigation buttons
+ col1, col2 = st.columns(2)
+ with col1:
+ st.button("Previous: Methodological Approaches", key="prev_to_4", on_click=navigate_to_module, args=(4,))
+
+ with col2:
+ st.button("Next: Conclusion", key="next_to_6", on_click=navigate_to_module, args=(6,))
+
+ else: # Module 6
+ # MODULE 6: Conclusion
+ st.title("Module 6: Conclusion and Future Directions")
+
+ col1, col2 = st.columns([3, 2])
+
+ with col1:
+ st.markdown("""
+ ### Workshop Summary
+
+ Throughout this workshop, we've explored:
+
+ 1. **Text-Image Interdependence**: The complex relationship between textual and visual elements
+ 2. **OCR Technology**: The evolution of OCR and its application to historical materials
+ 3. **Methodological Approaches**: Hybrid strategies for working with historical texts
+ 4. **Practical Application**: Hands-on experience with OCR processing tools
+ """)
+
+ st.markdown("""
+ ### Key Takeaways
+
+ 1. **OCR is Not Perfect**: Even advanced AI models face challenges with historical documents
+ 2. **Context Matters**: Vision-enhanced models provide better results by understanding document context
+ 3. **Hybrid Approaches**: Combining computational methods with traditional research yields best results
+ 4. **Critical Evaluation**: Always evaluate OCR outputs with awareness of limitations
+ 5. **Structured Extraction**: Modern OCR goes beyond text recognition to understand document structure
+ """)
+
+ with col2:
+ # Display statistics if there's processing history
+ if st.session_state.processing_history:
+ st.subheader("Your Workshop Statistics")
+
+ # Calculate statistics
+ total_docs = len(st.session_state.processing_history)
+ vision_docs = len([item for item in st.session_state.processing_history if item['useVision']])
+ non_vision_docs = total_docs - vision_docs
+
+ # Create metrics for statistics
+ col1, col2 = st.columns(2)
+
+ with col1:
+ st.metric("Documents Processed", total_docs)
+ st.metric("With Vision Model", vision_docs)
+
+ with col2:
+ st.metric("Without Vision Model", non_vision_docs)
+
+ # Topic frequencies across all processed documents (rendered as a bar chart below)
+ if total_docs > 0:
+ st.subheader("Topics Encountered")
+ all_topics = []
+ for item in st.session_state.processing_history:
+ if 'topics' in item['result']:
+ all_topics.extend(item['result']['topics'])
+
+ if all_topics:
+ # Count topic frequencies
+ topic_counts = {}
+ for topic in all_topics:
+ if topic in topic_counts:
+ topic_counts[topic] += 1
+ else:
+ topic_counts[topic] = 1
+
+ # Display the topic counts as a bar chart
+ st.bar_chart(topic_counts)
+
+ st.subheader("Future Directions")
+
+ col1, col2 = st.columns(2)
+
+ with col1:
+ st.markdown("""
+ ### Technological Developments
+
+ - **Multimodal AI models**: Increasingly sophisticated understanding
+ - **Historical font training**: Models trained on historical typography
+ - **Document intelligence**: Enhanced understanding of structures
+ - **Collaborative correction**: Platforms for collective improvement
+ """)
+
+ with col2:
+ st.markdown("""
+ ### Research Applications
+
+ - **Large-scale corpus analysis**: Processing entire archives
+ - **Multilingual historical research**: Working across languages
+ - **Image-text integration**: New methodologies for visual analysis
+ - **Computational paleography**: AI-assisted handwriting analysis
+ """)
+
+ st.markdown("""
+ ### Additional Resources
+
+ - **[Mistral AI Documentation](https://docs.mistral.ai/)**: Learn more about the OCR models used in this workshop
+ - **[Transkribus](https://readcoop.eu/transkribus/)**: Platform for historical document transcription
+ - **[OCR-D](https://ocr-d.de/en/)**: Coordinated OCR research project for historical documents
+ - **[Historical OCR Research Papers](https://scholar.google.com/scholar?q=historical+OCR)**: Academic research on historical OCR
+ """)
+
+ # Reset button to start over
+ if st.button("Start Workshop Again", key="reset_workshop", use_container_width=True):
+ st.session_state.current_module = 1
+ st.rerun()
+
+# Handle API-style requests: when the query-parameter value looked up below
+# contains "api/process", run the OCR request handler and render its outcome
+# as JSON instead of relying on the interactive UI.
+# NOTE(review): the lookup reads the query parameter whose key is the empty
+# string, not the URL path -- confirm this condition can actually be true.
+if 'api/process' in st.query_params.get('', ''):
+    api_result = process_api_request()
+    # A falsy result (None or empty) is reported as an invalid request;
+    # anything else is made JSON-safe before being rendered.
+    st.json(make_serializable(api_result) if api_result else {"error": "Invalid request"})
\ No newline at end of file
diff --git a/streamlit_app.py b/streamlit_app.py
new file mode 100644
index 0000000000000000000000000000000000000000..f941891f42dbc951d73a962a9b7f445e166ef07e
--- /dev/null
+++ b/streamlit_app.py
@@ -0,0 +1,2028 @@
+import os
+import streamlit as st
+import json
+import sys
+from pathlib import Path
+import tempfile
+from datetime import datetime
+import io
+import base64
+from io import BytesIO
+from enum import Enum
+import inspect
+
+# Add this script's own directory to sys.path so the OCR modules next to it can be imported
+parent_dir = Path(__file__).parent.absolute()
+sys.path.append(str(parent_dir))
+
+# Import the StructuredOCR class (process_file is resolved further below, with a local fallback)
+from structured_ocr import StructuredOCR
+
+# Add API endpoint support for the React app
+from streamlit.web.server.server import Server
+from streamlit.runtime.scriptrunner import get_script_run_ctx
+
+# Custom JSON encoder to handle Enum types and other non-serializable objects
+class EnhancedJSONEncoder(json.JSONEncoder):
+    """JSON encoder for Enums and objects that can describe themselves.
+
+    ``json`` only calls :meth:`default` for values it cannot serialize
+    natively, so built-in types never reach the checks below.
+    """
+
+    def default(self, obj):
+        """Return a JSON-compatible stand-in for ``obj``.
+
+        Conversions are tried from most to least specific:
+
+        1. ``Enum`` members become their ``value``.
+        2. Instances with a callable ``model_dump()`` (e.g. Pydantic models)
+           or ``to_dict()`` are converted through that method.
+        3. Any other object with a ``__dict__`` becomes a dict of its
+           public (non-underscore) attributes.
+
+        Anything else is delegated to the base class, which raises
+        ``TypeError``.
+        """
+        if isinstance(obj, Enum):
+            return obj.value
+        if not isinstance(obj, type):
+            # Explicit conversion hooks must be consulted before the generic
+            # __dict__ fallback: objects that define model_dump()/to_dict()
+            # normally also have a __dict__, so testing __dict__ first made
+            # these branches unreachable. Classes are skipped because their
+            # hooks are unbound and cannot be called without an instance.
+            for hook_name in ('model_dump', 'to_dict'):
+                hook = getattr(obj, hook_name, None)
+                if callable(hook):
+                    return hook()
+        if hasattr(obj, '__dict__'):
+            # Generic fallback: expose only public attributes.
+            return {key: value for key, value in obj.__dict__.items()
+                    if not key.startswith('_')}
+        # Let the base class raise TypeError for unsupported types.
+        return super().default(obj)
+
+# Helper function to convert any result to JSON-serializable
+def make_serializable(obj):
+    """Recursively convert ``obj`` into JSON-serializable built-in types.
+
+    Conversion rules, applied in order:
+
+    * dicts are rebuilt with every value converted (keys are kept as-is);
+    * lists, tuples, sets and frozensets become lists of converted items;
+    * ``Enum`` members become their ``value``;
+    * objects with both a ``pages`` attribute and a ``__dict__`` (the
+      OCR-response case) become a dict of their public attributes, with
+      ``pages`` always emitted as a list of converted pages;
+    * instances with a callable ``model_dump()`` or ``to_dict()`` are
+      converted through that method;
+    * other objects with a ``__dict__`` become a dict of their public
+      (non-underscore) attributes;
+    * anything else is returned unchanged.
+
+    Args:
+        obj: Any value, possibly nested.
+
+    Returns:
+        A structure built from dicts, lists and the leaf values above.
+    """
+    if isinstance(obj, dict):
+        return {k: make_serializable(v) for k, v in obj.items()}
+    if isinstance(obj, (list, tuple, set, frozenset)):
+        # JSON has a single array type. Tuples and sets used to be returned
+        # untouched, so non-serializable members inside them were missed.
+        return [make_serializable(item) for item in obj]
+    if isinstance(obj, Enum):
+        return obj.value
+
+    has_attr_dict = hasattr(obj, '__dict__')
+
+    if has_attr_dict and hasattr(obj, 'pages'):
+        # Special case for OCRResponse-like objects which have a pages attribute
+        result = {k: make_serializable(v) for k, v in obj.__dict__.items()
+                  if not k.startswith('_')}
+        # Explicitly handle pages so it is always present as a list
+        result['pages'] = [make_serializable(page) for page in obj.pages]
+        return result
+
+    if not isinstance(obj, type):
+        # Explicit conversion hooks take precedence over the generic
+        # __dict__ dump; with the opposite order these branches could only
+        # be reached by objects that have no __dict__ at all. Classes are
+        # skipped because their hooks are unbound.
+        for hook_name in ('model_dump', 'to_dict'):
+            hook = getattr(obj, hook_name, None)
+            if callable(hook):
+                return make_serializable(hook())
+
+    if has_attr_dict:
+        return {k: make_serializable(v) for k, v in obj.__dict__.items()
+                if not k.startswith('_')}
+
+    if hasattr(obj, 'pages'):
+        # An object exposing pages without a __dict__ (e.g. __slots__)
+        # previously fell through and produced None.
+        return {'pages': [make_serializable(page) for page in obj.pages]}
+
+    # Basic types are returned as is
+    return obj
+
+# API response handler
+def process_api_request():
+    """Run OCR for a file submitted by the React frontend.
+
+    Looks up the current Streamlit session, takes the first entry stored
+    under the ``'file'`` key of its uploaded-file manager, and processes it
+    with ``process_file``. The ``use_vision`` flag is read from the session's
+    query string and defaults to ``'true'``.
+
+    Returns:
+        The JSON-serializable OCR result, ``{"error": <message>}`` when
+        processing raises, or ``None`` when there is no script context,
+        no session info, or no uploaded file.
+
+    NOTE(review): this relies on Streamlit internals
+    (``Server.get_current()``, ``_get_session_info``, ``_uploaded_files``);
+    failures in those lookups happen outside the ``try`` below and will
+    propagate -- confirm they exist in the deployed Streamlit version.
+    """
+    script_ctx = get_script_run_ctx()
+    if script_ctx is None:
+        return None
+
+    info = Server.get_current()._get_session_info(script_ctx.session_id)
+    if info is None:
+        return None
+
+    pending_uploads = info.uploaded_file_mgr._uploaded_files.get('file')
+    if not pending_uploads:
+        return None
+
+    # Only the first uploaded entry is processed.
+    upload = pending_uploads[0]
+    vision_flag = info.query_string.get('use_vision', ['true'])[0]
+    use_vision = vision_flag.lower() == 'true'
+
+    try:
+        # Convert before returning so callers always receive plain JSON types.
+        return make_serializable(process_file(upload, use_vision=use_vision))
+    except Exception as exc:
+        # Report the failure as data instead of raising.
+        return {"error": str(exc)}
+
+try:
+    # Prefer the shared helper module when it can be imported.
+    from process_file import process_file as process_file_util
+    process_file = process_file_util
+except ImportError:
+    # Helper module unavailable: fall back to a local implementation.
+    def process_file(uploaded_file, use_vision=True):
+        """Run structured OCR on an uploaded file and record it in the history.
+
+        The upload's bytes (``uploaded_file.getvalue()``) are written to a
+        temporary file that keeps the original extension, processed with
+        ``StructuredOCR``, and the outcome is appended to
+        ``st.session_state.processing_history``.
+
+        Args:
+            uploaded_file: Object exposing ``name`` and ``getvalue()``.
+            use_vision: Forwarded to ``StructuredOCR.process_file``.
+
+        Returns:
+            The OCR result, or ``None`` after the error has been shown with
+            ``st.error``. The temporary file is removed in either case.
+        """
+        suffix = Path(uploaded_file.name).suffix
+        with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as handle:
+            handle.write(uploaded_file.getvalue())
+            temp_path = handle.name
+
+        try:
+            # Anything that is not a PDF is treated as an image.
+            file_type = "pdf" if suffix.lower() == ".pdf" else "image"
+            ocr = StructuredOCR()
+            result = ocr.process_file(temp_path, file_type=file_type, use_vision=use_vision)
+
+            if 'processing_history' not in st.session_state:
+                st.session_state.processing_history = []
+
+            st.session_state.processing_history.append({
+                "id": datetime.now().timestamp(),
+                "fileName": uploaded_file.name,
+                "timestamp": datetime.now().isoformat(),
+                "result": result,
+                "useVision": use_vision
+            })
+            return result
+        except Exception as exc:
+            st.error(f"Error processing document: {str(exc)}")
+            return None
+        finally:
+            # delete=False above means the file must be removed by hand.
+            if os.path.exists(temp_path):
+                os.unlink(temp_path)
+
+# Page-level settings: browser tab title/icon and overall layout
+_PAGE_CONFIG = {
+    "page_title": "Historical OCR Workshop",
+    "page_icon": "📜",
+    "layout": "wide",
+    # Start with the sidebar collapsed for a cleaner landing screen
+    "initial_sidebar_state": "collapsed",
+}
+st.set_page_config(**_PAGE_CONFIG)
+
+# Custom CSS to match React dark theme and improve Streamlit integration
+st.markdown("""
+
+""", unsafe_allow_html=True)
+
+# Seed the per-session workshop state; keys that already exist are left alone
+for _state_key, _initial_value in (
+    ('current_module', 1),
+    ('processing_history', []),
+    ('workshop_started', False),
+):
+    if _state_key not in st.session_state:
+        st.session_state[_state_key] = _initial_value
+
+def navigate_to_module(module_number):
+    """Record ``module_number`` as the session's current workshop module.
+
+    Used both as a button ``on_click`` callback and called directly; it only
+    updates ``st.session_state`` and does not trigger a rerun itself.
+    """
+    st.session_state['current_module'] = module_number
+
+# Welcome/Start screen if workshop hasn't been started
+if not st.session_state.workshop_started:
+ # Hero section with eye-catching design
+ st.markdown("""
+
+
Historical OCR Workshop
+
Unlock the potential of historical documents with modern OCR technology
+
+ This interactive workshop explores the application of OCR technology to historical documents,
+ combining theoretical understanding with practical experiences. Designed for historians,
+ archivists, and digital humanities scholars, it offers both conceptual frameworks and hands-on skills.
+
+ Optical Character Recognition (OCR) technology enables computers to extract text from images and documents.
+ Modern OCR uses AI vision models to understand both the text and its visual context, making it powerful for
+ historical research and digital humanities.
+
+ """, unsafe_allow_html=True)
+
+ with col2:
+ # Add an engaging research question to connect with historians
+ st.markdown("""
+
+
For Historians:
+ How might OCR technology transform our access to and interpretation of historical documents?
+ What new research questions become possible when large archives become machine-readable?
+
+ """, unsafe_allow_html=True)
+
+ # Display sample historical document images in a better format
+ input_dir = Path(__file__).parent / "input"
+ sample_images = [
+ {"path": input_dir / "letter-1.jpg", "caption": "Historical Letter"},
+ {"path": input_dir / "recipe.jpg", "caption": "Historical Recipe"}
+ ]
+
+ # Try to find any of the sample images
+ for sample in sample_images:
+ if sample["path"].exists():
+ try:
+ from PIL import Image
+ with Image.open(sample["path"]) as img:
+ # Add a better styled border and shadow
+ st.markdown(f"""
+
+
+ - Processing historical documents with OCR
+ - Analyzing and structuring extracted information
+ - Integrating OCR into research workflows
+ - Building searchable archives
+
+ """, unsafe_allow_html=True)
+
+ with col2:
+ # Workshop modules with visually appealing cards
+ st.markdown('
Workshop Modules
', unsafe_allow_html=True)
+
+ # Add some styling for the module cards
+ st.markdown("""
+
+ """, unsafe_allow_html=True)
+
+ # Modules inside the col2 from earlier
+ with col2:
+ st.markdown("""
+
+
Module 1
+
Introduction and Problematization
+
Explore the challenges of historical document digitization and the potential of OCR technologies
+ to transform historical research. Examine key problems and opportunities in historical OCR.
Understand the evolution of OCR technology from pattern matching to modern AI vision-language models,
+ and how they address the unique challenges of historical documents.
Develop hybrid methodologies that combine computational processing with traditional
+ historical research practices, balancing distant and close reading.
Synthesize workshop insights and explore future directions for OCR in historical research,
+ from large-scale corpus analysis to computational paleography.
+
+ """, unsafe_allow_html=True)
+
+ # Engaging quote to inspire participation with citation - in a better styled container
+ st.markdown("""
+
+
"
+
+
The digital turn in historical research is not just about converting analog to digital;
+ it's about transforming how we access, analyze, and interpret the past.
+
+
— Dr. Jane Winters, Professor of Digital Humanities
+
+ """, unsafe_allow_html=True)
+
+ # Feature highlight before call to action with better styling
+ st.markdown("""
+
+
+
+
Workshop Highlights
+
+
+
+
Interactive Learning
+
Hands-on document processing with real-time feedback and analysis
+
+
+
+
Real Historical Documents
+
Work with authentic materials spanning different eras and formats
+
+
+
+
Vision AI Models
+
Experience state-of-the-art OCR technology powered by advanced AI
+
+
+
+
Research Applications
+
Learn to integrate OCR into historical research workflows
+
+
+
+ """, unsafe_allow_html=True)
+
+ # Enhanced start button with dynamic styling and clear call to action
+ st.markdown("""
+
+
Ready to Start Your Journey?
+
+
+
+
No installation required • Start immediately • Interactive experience
+
+
+
+ """, unsafe_allow_html=True)
+
+ # Hidden button to trigger the workshop start
+ col1, col2, col3 = st.columns([1, 1, 1])
+ with col2:
+ if st.button("Begin Workshop", key="streamlit-button", use_container_width=True, type="primary"):
+ st.session_state.workshop_started = True
+ st.rerun()
+
+# Display workshop navigation sidebar only if workshop has started
+elif st.session_state.workshop_started:
+ # Define input directory for images
+ input_dir = Path(__file__).parent / "input"
+
+ # Enhanced sidebar navigation
+ with st.sidebar:
+ st.markdown("
+ """, unsafe_allow_html=True)
+
+ # Hidden button for jump
+ if st.button("Jump to Experiment", key="jump_exp"):
+ navigate_to_module(5)
+ st.rerun()
+
+ # Workshop information in a cleaner collapsible section
+ with st.expander("About the Workshop"):
+ st.markdown("""
+ This interactive workshop explores OCR technology for historical documents.
+
+ **How to use this workshop:**
+ 1. Navigate through modules sequentially
+ 2. Expand content sections to read more
+ 3. Try the interactive OCR experiment
+ 4. Reflect on research questions
+
+ For help or more information, use the reference materials in Module 6.
+ """)
+
+ # Enhanced progress tracking
+ if st.session_state.processing_history:
+ with st.expander("Your Activity"):
+ st.markdown(f"Documents processed: {len(st.session_state.processing_history)}", unsafe_allow_html=True)
+
+ # Show the most recent document processed with better formatting
+ latest = st.session_state.processing_history[-1]
+ st.markdown(f"""
+
+ Latest document: {latest['fileName']}
+ Processed with {' vision model' if latest['useVision'] else ' basic OCR'}
+
+ """, unsafe_allow_html=True)
+
+ # Main content based on current module
+ if st.session_state.current_module == 1:
+ # MODULE 1: Introduction
+ st.title("Module 1: Introduction and Problematization")
+
+ col1, col2 = st.columns([2, 1])
+
+ with col1:
+ st.markdown("""
+ ## Historical OCR Workshop
+
+ ### The Problem
+
+ Historical documents present unique challenges for OCR technology:
+
+ - Varying typography and handwriting styles
+ - Document degradation and damage
+ - Complex layouts and formatting
+ - Multiple languages and archaic terminology
+ - Illustrations and decorative elements
+ """)
+
+ with col2:
+ st.markdown("""
+ ### Workshop Goals
+
+ By the end of this workshop, you will:
+
+ 1. Understand text-image relationships in historical archives
+ 2. Learn about advanced OCR technology
+ 3. Explore methodological approaches
+ 4. Gain hands-on experience with OCR tools
+ 5. Develop research integration strategies
+ """)
+
+ # Next button
+ st.button("Next: Text-Image Relations", key="next_to_2", on_click=navigate_to_module, args=(2,))
+
+ elif st.session_state.current_module == 2:
+ # MODULE 2: Text-Image Relations
+ st.title("Module 2: Text-Image Relations in Historical Archives")
+
+ col1, col2 = st.columns([1, 1])
+
+ with col1:
+ st.markdown("""
+ ### Textual Elements
+
+ - **Typography**: Varying fonts, sizes, and styles
+ - **Layout**: Columns, margins, and spacing
+ - **Marginalia**: Notes, comments, and additions
+ - **Decorative Text**: Illuminated letters and calligraphy
+ """)
+
+ st.markdown("""
+ ### Visual Elements
+
+ - **Illustrations**: Diagrams, maps, and artistic representations
+ - **Watermarks**: Hidden identifiers that locate documents
+ - **Damage**: Tears, stains, and fading affecting legibility
+ - **Material Features**: Paper quality and physical dimensions
+ """)
+
+ with col2:
+ st.markdown("""
+ ### Interdependence
+
+ The relationship between text and image in historical documents exists on a complex spectrum:
+
+ - Text functions as image (decorative headings)
+ - Images function as text (symbolic representations)
+ - Layout creates meaning through visual organization
+ - Material conditions affect both textual and visual elements
+ """)
+
+ st.image("https://upload.wikimedia.org/wikipedia/commons/thumb/0/0c/Book_of_Kells_folio_292r.jpg/800px-Book_of_Kells_folio_292r.jpg",
+ caption="Book of Kells - Example of text-image integration")
+
+ st.markdown("""
+ ### OCR Challenges
+
+ These complex text-image relationships create particular challenges for OCR:
+
+ 1. **Distinguishing Text from Decoration**: Where does ornamental text end and functional text begin?
+ 2. **Handling Illustrations**: Should they be processed as images or described as text?
+ 3. **Interpreting Layout**: How to capture the significance of spacing and organization?
+ 4. **Preserving Context**: Maintaining the relationship between textual and visual elements
+ """)
+
+ # Navigation buttons
+ col1, col2 = st.columns(2)
+ with col1:
+ st.button("Previous: Introduction", key="prev_to_1", on_click=navigate_to_module, args=(1,))
+
+ with col2:
+ st.button("Next: OCR Technology", key="next_to_3", on_click=navigate_to_module, args=(3,))
+
+ elif st.session_state.current_module == 3:
+ # MODULE 3: OCR Technology
+ st.title("Module 3: OCR Technology and Historical Documents")
+
+ col1, col2 = st.columns([1, 1])
+
+ with col1:
+ st.markdown("""
+ ### Traditional OCR Approaches
+
+ 1. **Pattern Matching**: Early OCR compared characters to templates
+ 2. **Feature Extraction**: Identifying key features of characters
+ 3. **Statistical Models**: Using probabilities to improve recognition
+ """)
+
+ st.markdown("""
+ ### Modern AI-Enhanced OCR
+
+ 1. **Neural Networks**: Deep learning models trained on vast datasets
+ 2. **Computer Vision**: Advanced image processing techniques
+ 3. **Language Models**: Contextual understanding to resolve ambiguities
+ 4. **Multimodal Models**: Integration of text, layout, and visual understanding
+ """)
+
+ with col2:
+ st.markdown("""
+ ### Challenges with Historical Documents
+
+ Historical materials present unique difficulties:
+
+ - **Typography Variation**: Non-standardized fonts and styles
+ - **Historical Language**: Archaic vocabulary and grammar
+ - **Layout Complexity**: Non-linear arrangements
+ - **Document Degradation**: Fading, tears, stains, and damage
+ - **Material Artifacts**: Paper texture, binding shadows, etc.
+ """)
+
+ st.image("https://cdn.dribbble.com/users/412119/screenshots/16353886/media/82e593c60a5e4d460db917236eab6ece.jpg",
+ caption="OCR processing layers")
+
+ # Display processing history if available
+ if st.session_state.processing_history:
+ with st.expander("Your OCR Processing History"):
+ st.markdown("You've already processed the following documents:")
+
+ for item in st.session_state.processing_history:
+ st.markdown(f"**{item['fileName']}** - {datetime.fromisoformat(item['timestamp']).strftime('%Y-%m-%d %H:%M')}")
+ col1, col2 = st.columns(2)
+ with col1:
+ st.write(f"**Topics:** {', '.join(item['result'].get('topics', ['Unknown']))}")
+ with col2:
+ st.write(f"**Vision model used:** {'Yes' if item['useVision'] else 'No'}")
+
+ # Quick link to experiment
+ st.button("Jump to OCR Experiment", key="jump_to_5", on_click=navigate_to_module, args=(5,))
+
+ # Navigation buttons
+ col1, col2 = st.columns(2)
+ with col1:
+ st.button("Previous: Text-Image Relations", key="prev_to_2", on_click=navigate_to_module, args=(2,))
+
+ with col2:
+ st.button("Next: Methodological Approaches", key="next_to_4", on_click=navigate_to_module, args=(4,))
+
+ elif st.session_state.current_module == 4:
+ # MODULE 4: Methodological Approaches
+ st.title("Module 4: Methodological Approaches")
+
+ col1, col2 = st.columns([1, 1])
+
+ with col1:
+ st.markdown("""
+ ### Hybrid Methodologies
+
+ 1. **Computational + Human Reading**
+ - OCR for initial processing and discovery
+ - Human review for context and interpretation
+ - Iterative refinement of computational outputs
+
+ 2. **Close + Distant Reading**
+ - Distant reading through large-scale OCR processing
+ - Close reading of selected passages
+ - Zooming between scales of analysis
+ """)
+
+ # Reference to diagram.jpg
+ input_dir = Path(__file__).parent / "input"
+ diagram_path = input_dir / "diagram.jpg"
+ if diagram_path.exists():
+ try:
+ # Load image file directly from disk
+ from PIL import Image
+ with Image.open(diagram_path) as img:
+ st.image(img, caption="Historical VLM architecture", use_column_width=True)
+ except Exception:
+ pass
+
+ with col2:
+ st.markdown("""
+ ### Mistral-OCR-Latest: State-of-the-Art
+
+ The Mistral-OCR model represents a significant advancement:
+
+ - **Multimodal Understanding**: Processes both visual and textual information
+ - **Contextual Awareness**: Considers historical context
+ - **Layout Recognition**: Preserves complex document structures
+ - **Historical Font Adaptation**: Trained on diverse historical typography
+ """)
+
+ # Reference to workflow.jpg
+ workflow_path = input_dir / "workflow.jpg"
+ if workflow_path.exists():
+ try:
+ # Load image file directly from disk
+ from PIL import Image
+ with Image.open(workflow_path) as img:
+ st.image(img, caption="Mistral OCR workflow", use_column_width=True)
+ except Exception:
+ pass
+
+ st.markdown("""
+ ### Practical Workflow
+
+ A typical historical OCR workflow with Mistral-OCR includes:
+
+ 1. **Selection**: Choosing appropriate documents
+ 2. **Preprocessing**: Enhancing images before OCR
+ 3. **OCR Processing**: Running documents through vision-enhanced OCR
+ 4. **Post-processing**: Cleaning up outputs and structured extraction
+ 5. **Verification**: Cross-checking results against originals
+ 6. **Integration**: Incorporating OCR outputs into research materials
+ """)
+
+ # Navigation buttons
+ col1, col2 = st.columns(2)
+ with col1:
+ st.button("Previous: OCR Technology", key="prev_to_3", on_click=navigate_to_module, args=(3,))
+
+ with col2:
+ st.button("Next: Interactive OCR Experiment", key="next_to_5", on_click=navigate_to_module, args=(5,))
+
+ elif st.session_state.current_module == 5:
+ # MODULE 5: Interactive OCR Experiment
+ st.title("Module 5: Interactive OCR Experiment")
+
+ # More modular design with sequenced steps
+ st.markdown("""
+
+ This interactive module allows you to process historical documents with OCR and analyze the results.
+ Follow the sequenced steps below to experiment with historical document analysis.
+
+ """, unsafe_allow_html=True)
+
+ # Tabbed interface for different activities
+ experiment_tab, compare_tab, analyze_tab = st.tabs(["Process Documents", "Compare Results", "Analysis Guide"])
+
+ with experiment_tab:
+ # Import additional libraries for enhanced functionality
+ try:
+ from pdf2image import convert_from_bytes
+ pdf_support = True
+ except ImportError:
+ pdf_support = False
+ st.warning("PDF preview functionality is limited. The pdf2image module is required for PDF previews.")
+
+ # OCR tool in a compact layout
+ col1, col2 = st.columns([1, 1])
+
+ with col1:
+ st.markdown('
', unsafe_allow_html=True)
+ st.markdown("
Step 1: Select Document & Options
", unsafe_allow_html=True)
+
+ # Processing options
+ use_vision = st.checkbox("Use Vision Model", value=True,
+ help="Use the vision model for improved analysis")
+
+ # Additional prompt for the model
+ st.markdown("### Custom Research Prompt (Optional)")
+ st.markdown("""Provide additional instructions to guide the OCR analysis.
+ Focus on specific aspects of historical research you're interested in.""")
+ custom_prompt = st.text_area("Research Prompt",
+ placeholder="E.g., Focus on identifying dates and historical figures; Analyze the writing style for period-specific terminology; Highlight any cultural or social indicators of the time period...",
+ help="Optional instructions to guide the analysis of the historical document")
+
+ # Example preset documents selection
+ input_dir = Path(__file__).parent / "input"
+ if input_dir.exists():
+ sample_files = list(input_dir.glob("*.jpg")) + list(input_dir.glob("*.png")) + list(input_dir.glob("*.pdf"))
+ if sample_files:
+ st.markdown("#### Sample Documents")
+ sample_options = ["Upload my own document"] + [f.name for f in sample_files]
+ sample_choice = st.selectbox("Choose a document:", sample_options)
+
+ if sample_choice != "Upload my own document":
+ selected_file = next((f for f in sample_files if f.name == sample_choice), None)
+ if selected_file:
+ # Store the selected sample file in session state
+ with open(selected_file, "rb") as f:
+ file_bytes = f.read()
+ st.session_state.sample_file = {
+ "name": selected_file.name,
+ "bytes": file_bytes
+ }
+
+ # Preview the selected sample
+ if selected_file.suffix.lower() == ".pdf" and pdf_support:
+ try:
+ with st.spinner("Generating PDF preview..."):
+ images = convert_from_bytes(file_bytes, first_page=1, last_page=1, dpi=150)
+ if images:
+ # Convert PIL image to bytes for Streamlit
+ first_page = images[0]
+ img_bytes = io.BytesIO()
+ first_page.save(img_bytes, format='JPEG')
+ img_bytes.seek(0)
+ st.image(img_bytes, caption=f"Preview: {selected_file.name}", use_container_width=True)
+ except Exception:
+ st.info(f"PDF selected: {selected_file.name}")
+ else:
+ # For images display directly
+ try:
+ from PIL import Image
+ img = Image.open(BytesIO(file_bytes))
+ st.image(img, caption=f"Preview: {selected_file.name}", use_container_width=True)
+ except Exception:
+ st.info(f"Selected: {selected_file.name}")
+ else:
+ # Clear the sample file if "Upload my own" is selected
+ if 'sample_file' in st.session_state:
+ del st.session_state.sample_file
+
+ # File uploader with styling matched to React theme
+ st.markdown('
', unsafe_allow_html=True)
+ uploaded_file = st.file_uploader("Choose a file", type=["pdf", "png", "jpg", "jpeg"], label_visibility="collapsed")
+
+ if uploaded_file is None:
+ st.markdown("### Upload a document to get started")
+ st.markdown("Supported formats: PDF, JPG, PNG")
+ else:
+ # Display the uploaded file
+ file_ext = Path(uploaded_file.name).suffix.lower()
+
+ if file_ext == ".pdf" and pdf_support:
+ try:
+ # Convert first page of PDF to image for preview
+ pdf_bytes = uploaded_file.getvalue()
+ with st.spinner("Generating PDF preview..."):
+ images = convert_from_bytes(pdf_bytes, first_page=1, last_page=1, dpi=150)
+
+ if images:
+ # Convert PIL image to bytes for Streamlit
+ first_page = images[0]
+ img_bytes = io.BytesIO()
+ first_page.save(img_bytes, format='JPEG')
+ img_bytes.seek(0)
+
+ # Display the PDF preview
+ st.image(img_bytes, caption=f"PDF Preview: {uploaded_file.name}", use_container_width=True)
+ else:
+ st.info(f"PDF uploaded: {uploaded_file.name}")
+ except Exception:
+ # Simply show the file name without an error message
+ st.info(f"PDF uploaded: {uploaded_file.name}")
+ elif file_ext != ".pdf":
+ st.image(uploaded_file, use_container_width=True)
+ else:
+ st.info(f"PDF uploaded: {uploaded_file.name}")
+
+ st.markdown('
', unsafe_allow_html=True)
+ else:
+ # No sample files found, just show the uploader
+ st.markdown('
', unsafe_allow_html=True)
+ uploaded_file = st.file_uploader("Choose a file", type=["pdf", "png", "jpg", "jpeg"], label_visibility="collapsed")
+
+ if uploaded_file is None:
+ st.markdown("### Upload a document to get started")
+ st.markdown("Supported formats: PDF, JPG, PNG")
+ else:
+ # Display the uploaded file preview
+ file_ext = Path(uploaded_file.name).suffix.lower()
+ if file_ext == ".pdf" and pdf_support:
+ try:
+ # PDF preview logic
+ pdf_bytes = uploaded_file.getvalue()
+ with st.spinner("Generating PDF preview..."):
+ images = convert_from_bytes(pdf_bytes, first_page=1, last_page=1, dpi=150)
+ if images:
+ first_page = images[0]
+ img_bytes = io.BytesIO()
+ first_page.save(img_bytes, format='JPEG')
+ img_bytes.seek(0)
+ st.image(img_bytes, caption=f"PDF Preview: {uploaded_file.name}", use_container_width=True)
+ else:
+ st.info(f"PDF uploaded: {uploaded_file.name}")
+ except Exception:
+ st.info(f"PDF uploaded: {uploaded_file.name}")
+ elif file_ext != ".pdf":
+ st.image(uploaded_file, use_container_width=True)
+ else:
+ st.info(f"PDF uploaded: {uploaded_file.name}")
+ st.markdown('
', unsafe_allow_html=True)
+ else:
+ # Input directory doesn't exist, just show the uploader
+ st.markdown('
', unsafe_allow_html=True)
+ uploaded_file = st.file_uploader("Choose a file", type=["pdf", "png", "jpg", "jpeg"], label_visibility="collapsed")
+
+ if uploaded_file is None:
+ st.markdown("### Upload a document to get started")
+ st.markdown("Supported formats: PDF, JPG, PNG")
+ st.markdown('
', unsafe_allow_html=True)
+
+ # Step 2: Process document
+ st.subheader("Step 2: Process the Document")
+
+ # Get the file to process (either uploaded or sample)
+ file_to_process = None
+ if 'sample_file' in st.session_state and sample_choice != "Upload my own document":
+ # Create a FileUploader-like object from the sample file
+ class SampleFileObject:
+ def __init__(self, name, data):
+ self.name = name
+ self._data = data
+
+ def getvalue(self):
+ return self._data
+
+ file_to_process = SampleFileObject(
+ st.session_state.sample_file["name"],
+ st.session_state.sample_file["bytes"]
+ )
+ elif 'uploaded_file' in locals() and uploaded_file is not None:
+ file_to_process = uploaded_file
+
+ # Process button (disabled if no file selected)
+ process_button = st.button(
+ "Process Document",
+ disabled=file_to_process is None,
+ use_container_width=True
+ )
+
+ if process_button and file_to_process is not None:
+ with st.spinner("Processing document..."):
+ try:
+ # Process the file
+ result = process_file(file_to_process, use_vision, custom_prompt=custom_prompt if custom_prompt else None)
+
+ if result:
+ st.success("Document processed successfully!")
+
+ # Store result in session state for display in the right column
+ st.session_state.current_result = result
+ st.rerun() # Refresh to show result
+ else:
+ st.error("Failed to process document.")
+ except Exception as e:
+ st.error(f"Error processing document: {str(e)}")
+
+ st.markdown('
', unsafe_allow_html=True)
+
+ # Experiment instructions in a compact format
+ st.markdown('
', unsafe_allow_html=True)
+ st.markdown("
Experiment Instructions
", unsafe_allow_html=True)
+ st.markdown("""
+ 1. **Step 1:** Select a document and choose your options
+ 2. **Step 2:** Process the document with the selected options
+ 3. **Step 3:** Analyze the results in the panel on the right
+ 4. **Step 4:** Try again with different settings (e.g., toggle vision model)
+ 5. **Step 5:** Compare results between different runs
+ """)
+ st.markdown('
', unsafe_allow_html=True)
+
+ with col2:
+ st.markdown('
', unsafe_allow_html=True)
+ st.markdown("
Step 3: View Results
", unsafe_allow_html=True)
+
+ if 'current_result' in st.session_state and st.session_state.current_result:
+ result = st.session_state.current_result
+
+ # File info in a compact format
+ st.markdown(f"**File:** {result.get('file_name', 'Unknown')}")
+
+ # Horizontal display of metadata
+ col1, col2 = st.columns(2)
+ with col1:
+ if 'languages' in result and result['languages']:
+ languages = [lang for lang in result['languages'] if lang]
+ if languages:
+ st.markdown(f"**Languages:** {', '.join(languages)}")
+
+ with col2:
+ if 'topics' in result and result['topics']:
+ st.markdown(f"**Topics:** {', '.join(result['topics'])}")
+
+ # Create tabs for different views with inline styling to ensure visibility
+ tab1, tab2 = st.tabs(["Structured View", "Raw JSON"])
+ st.markdown("""
+
+ """, unsafe_allow_html=True)
+
+ with tab1:
+ # Display in a more user-friendly format based on the content structure
+ if 'ocr_contents' in result:
+ if isinstance(result['ocr_contents'], dict):
+ for section, content in result['ocr_contents'].items():
+ if content: # Only display non-empty sections
+ st.markdown(f"
{section.replace('_', ' ').title()}
", unsafe_allow_html=True)
+
+ if isinstance(content, str):
+ st.markdown(f"
{content}
", unsafe_allow_html=True)
+ elif isinstance(content, list):
+ for item in content:
+ if isinstance(item, str):
+ st.markdown(f"
", unsafe_allow_html=True)
+ elif isinstance(content, dict):
+ for k, v in content.items():
+ st.markdown(f"
{k}: {v}
", unsafe_allow_html=True)
+
+ with tab2:
+ # Show the raw JSON for developers
+ # Convert to serializable format first
+ serializable_result = make_serializable(result)
+ st.json(serializable_result)
+
+ # Download options
+ st.markdown("### Export Results")
+
+ col1, col2 = st.columns(2)
+
+ with col1:
+ # Export as JSON (using the serializable converter)
+ serializable_result = make_serializable(result)
+ json_bytes = json.dumps(serializable_result, indent=2).encode()
+ st.download_button(
+ label="Download JSON",
+ data=json_bytes,
+ file_name="ocr_results.json",
+ mime="application/json",
+ use_container_width=True
+ )
+
+ with col2:
+ # Export as text
+ if 'ocr_contents' in result and isinstance(result['ocr_contents'], dict) and 'content' in result['ocr_contents']:
+ text_content = result['ocr_contents']['content']
+ st.download_button(
+ label="Download Text",
+ data=text_content.encode(),
+ file_name="ocr_text.txt",
+ mime="text/plain",
+ use_container_width=True
+ )
+ else:
+ st.markdown("""
+
+
Results will appear here
+
+
Upload and process a document to see the OCR results in this panel.
+
+
The OCR tool will:
+
+
Extract text from your document
+
Identify languages and topics
+
Provide structured content analysis
+
Generate downloadable results
+
+
+ """, unsafe_allow_html=True)
+ st.markdown('
', unsafe_allow_html=True)
+
+ # Processing history
+ if st.session_state.processing_history:
+ st.markdown('
', unsafe_allow_html=True)
+ st.markdown("
Step 4: Review Processing History
", unsafe_allow_html=True)
+
+ # Most recent result summary
+ latest = st.session_state.processing_history[-1]
+ st.markdown(f"**Latest Document:** {latest['fileName']}")
+ st.markdown(f"**Processed at:** {datetime.fromisoformat(latest['timestamp']).strftime('%Y-%m-%d %H:%M')}")
+ st.markdown(f"**Vision model used:** {'Yes' if latest['useVision'] else 'No'}")
+
+ # Full history in expander
+ with st.expander("View Complete Processing History"):
+ for i, item in enumerate(reversed(st.session_state.processing_history)):
+ st.markdown(f"""
+
+ """, unsafe_allow_html=True)
+
+ # Add option to view a previous result
+ if st.button(f"View This Result", key=f"view_history_{i}"):
+ st.session_state.current_result = item['result']
+ st.rerun()
+ st.markdown('
', unsafe_allow_html=True)
+
+ # Compare tab for side-by-side comparison
+ with compare_tab:
+ st.subheader("Compare OCR Results")
+
+ if len(st.session_state.processing_history) >= 2:
+ st.markdown("""
+ Select two processing results to compare side by side. This allows you to see
+ how different options (like using the vision model) affect OCR quality.
+ """)
+
+ # Create selection dropdowns for the documents
+ col1, col2 = st.columns(2)
+ with col1:
+ # First document selector
+ doc_options_1 = [f"{i+1}: {item['fileName']} ({'Vision' if item['useVision'] else 'No Vision'})"
+ for i, item in enumerate(st.session_state.processing_history)]
+ doc_choice_1 = st.selectbox("First Document:", doc_options_1, key="compare_doc_1")
+ doc_index_1 = int(doc_choice_1.split(":")[0]) - 1
+
+ with col2:
+ # Second document selector
+ doc_options_2 = [f"{i+1}: {item['fileName']} ({'Vision' if item['useVision'] else 'No Vision'})"
+ for i, item in enumerate(st.session_state.processing_history)]
+ default_index = min(1, len(st.session_state.processing_history) - 1) # Default to second item
+ doc_choice_2 = st.selectbox("Second Document:", doc_options_2, key="compare_doc_2", index=default_index)
+ doc_index_2 = int(doc_choice_2.split(":")[0]) - 1
+
+ # Retrieve the selected documents
+ doc1 = st.session_state.processing_history[doc_index_1]
+ doc2 = st.session_state.processing_history[doc_index_2]
+
+ # Show comparison
+ col1, col2 = st.columns(2)
+
+ with col1:
+ st.markdown(f"### Document 1: {doc1['fileName']}")
+ st.markdown(f"**Processed at:** {datetime.fromisoformat(doc1['timestamp']).strftime('%Y-%m-%d %H:%M')}")
+ st.markdown(f"**Vision model used:** {'Yes' if doc1['useVision'] else 'No'}")
+
+ # Display content summary
+ if 'ocr_contents' in doc1['result'] and isinstance(doc1['result']['ocr_contents'], dict):
+ if 'content' in doc1['result']['ocr_contents']:
+ content = doc1['result']['ocr_contents']['content']
+ # Display first 500 characters with word wrap
+ st.markdown(f"""
+
+ {content[:500]}{'...' if len(content) > 500 else ''}
+
+ """, unsafe_allow_html=True)
+
+ with col2:
+ st.markdown(f"### Document 2: {doc2['fileName']}")
+ st.markdown(f"**Processed at:** {datetime.fromisoformat(doc2['timestamp']).strftime('%Y-%m-%d %H:%M')}")
+ st.markdown(f"**Vision model used:** {'Yes' if doc2['useVision'] else 'No'}")
+
+ # Display content summary
+ if 'ocr_contents' in doc2['result'] and isinstance(doc2['result']['ocr_contents'], dict):
+ if 'content' in doc2['result']['ocr_contents']:
+ content = doc2['result']['ocr_contents']['content']
+ # Display first 500 characters with word wrap
+ st.markdown(f"""
+
+ {content[:500]}{'...' if len(content) > 500 else ''}
+
+ """, unsafe_allow_html=True)
+
+ # Comparison analysis
+ if doc1['fileName'] == doc2['fileName'] and doc1['useVision'] != doc2['useVision']:
+ st.markdown("""
+
+
Vision vs. Non-Vision Model Comparison
+
You're comparing the same document processed with different models.
+ This is an excellent way to evaluate the impact of vision capabilities on OCR accuracy.
Process at least two documents to enable side-by-side comparison. Try processing
+ the same document with and without the vision model to see the differences in OCR quality.
+
+ """, unsafe_allow_html=True)
+
+ # Analysis tab for guidance on working with OCR results
+ with analyze_tab:
+ st.subheader("Analysis Guide")
+
+ st.markdown("""
+
+
How to Analyze OCR Results
+
This guide helps you assess the quality and usefulness of OCR output for historical research.
+
+ """, unsafe_allow_html=True)
+
+ st.markdown("""
+ ### Evaluating OCR Quality
+
+ When analyzing OCR results from historical documents, consider these key factors:
+
+ 1. **Text Accuracy**
+ - Check for common OCR errors (e.g., mistaking "e" for "c", "l" for "1")
+ - Assess recognition of period-specific typography and writing styles
+ - Evaluate handling of degraded or damaged text areas
+
+ 2. **Structure Preservation**
+ - Does the OCR maintain paragraph and section breaks?
+ - Are columns and tabular data correctly preserved?
+ - How well are page transitions handled?
+
+ 3. **Special Elements**
+ - Recognition of footnotes, marginalia, and annotations
+ - Handling of illustrations, diagrams, and decorative elements
+ - Treatment of watermarks, signatures, and stamps
+
+ 4. **Metadata Extraction**
+ - Accuracy of detected languages, topics, and document type
+ - Identification of dates, names, and key entities
+ - Recognition of document purpose and context
+ """)
+
+ col1, col2 = st.columns(2)
+
+ with col1:
+ st.markdown("""
+ ### Common OCR Challenges in Historical Documents
+
+ - **Typography Variations**: Historical fonts and writing styles that differ from modern text
+ - **Material Degradation**: Fading, stains, tears, and other damage affecting legibility
+ - **Handwritten Elements**: Marginalia, signatures, and handwritten annotations
+ - **Complex Layouts**: Multi-column formats, non-linear reading order, and decorative elements
+ - **Language and Terminology**: Archaic terms, specialized vocabulary, and multilingual content
+ """)
+
+ with col2:
+ st.markdown("""
+ ### Making the Most of OCR Results
+
+ - **Contextual Reading**: Use historical context to interpret unclear passages
+ - **Error Patterns**: Identify and mentally correct for systematic OCR errors
+ - **Hybrid Analysis**: Combine OCR-based search with close reading of original images
+ - **Comparative Processing**: Try different OCR settings and models on the same document
+ - **Iterative Refinement**: Use insights from each document to improve future processing
+ """)
+
+ st.markdown("""
+ ### Research Integration
+
+ Once you've obtained and analyzed OCR results from historical documents, consider these approaches for integrating them into your research:
+
+ 1. **Digital Corpus Building**: Create searchable collections of processed texts
+ 2. **Computational Analysis**: Apply text mining, topic modeling, or network analysis
+ 3. **Cross-Document Linking**: Identify connections across multiple sources
+ 4. **Annotation and Enrichment**: Add context, translations, or explanatory notes
+ 5. **Collaborative Research**: Share processed texts with other researchers
+
+ Remember that OCR is a tool to assist your research, not replace careful reading and analysis. The most effective approaches combine computational methods with traditional historical research practices.
+ """)
+
+ # Example of what to look for
+ if st.session_state.processing_history:
+ with st.expander("Example Analysis from Your Documents"):
+ # Pick the latest document
+ latest = st.session_state.processing_history[-1]
+
+ st.markdown(f"""
+ #### Sample Analysis for: {latest['fileName']}
+
+ **Document Context:**
+ - Languages: {', '.join(latest['result'].get('languages', ['Unknown']))}
+ - Topics: {', '.join(latest['result'].get('topics', ['Unknown']))}
+ - Vision model used: {'Yes' if latest['useVision'] else 'No'}
+
+ **What to Look For:**
+ 1. Check how well the model identified key topics and languages
+ 2. Evaluate the completeness of extracted text
+ 3. Note any systematic errors in text recognition
+ 4. Assess how well document structure was preserved
+ """)
+
+ # Navigation buttons
+ col1, col2 = st.columns(2)
+ with col1:
+ st.button("Previous: Methodological Approaches", key="prev_to_4", on_click=navigate_to_module, args=(4,))
+
+ with col2:
+ st.button("Next: Conclusion", key="next_to_6", on_click=navigate_to_module, args=(6,))
+
+ else: # Module 6
+ # MODULE 6: Conclusion
+ st.title("Module 6: Conclusion and Future Directions")
+
+ col1, col2 = st.columns([3, 2])
+
+ with col1:
+ st.markdown("""
+ ### Workshop Summary
+
+ Throughout this workshop, we've explored:
+
+ 1. **Text-Image Interdependence**: The complex relationship between textual and visual elements
+ 2. **OCR Technology**: The evolution of OCR and its application to historical materials
+ 3. **Methodological Approaches**: Hybrid strategies for working with historical texts
+ 4. **Practical Application**: Hands-on experience with OCR processing tools
+ """)
+
+ st.markdown("""
+ ### Key Takeaways
+
+ 1. **OCR is Not Perfect**: Even advanced AI models face challenges with historical documents
+ 2. **Context Matters**: Vision-enhanced models provide better results by understanding document context
+ 3. **Hybrid Approaches**: Combining computational methods with traditional research yields best results
+ 4. **Critical Evaluation**: Always evaluate OCR outputs with awareness of limitations
+ 5. **Structured Extraction**: Modern OCR goes beyond text recognition to understand document structure
+ """)
+
+ with col2:
+ # Display statistics if there's processing history
+ if st.session_state.processing_history:
+ st.subheader("Your Workshop Statistics")
+
+ # Calculate statistics
+ total_docs = len(st.session_state.processing_history)
+ vision_docs = len([item for item in st.session_state.processing_history if item['useVision']])
+ non_vision_docs = total_docs - vision_docs
+
+ # Create metrics for statistics
+ col1, col2 = st.columns(2)
+
+ with col1:
+ st.metric("Documents Processed", total_docs)
+ st.metric("With Vision Model", vision_docs)
+
+ with col2:
+ st.metric("Without Vision Model", non_vision_docs)
+
+ # Topics word cloud
+ if total_docs > 0:
+ st.subheader("Topics Encountered")
+ all_topics = []
+ for item in st.session_state.processing_history:
+ if 'topics' in item['result']:
+ all_topics.extend(item['result']['topics'])
+
+ if all_topics:
+ # Count topic frequencies
+ topic_counts = {}
+ for topic in all_topics:
+ if topic in topic_counts:
+ topic_counts[topic] += 1
+ else:
+ topic_counts[topic] = 1
+
+ # Display as a horizontal bar chart
+ st.bar_chart(topic_counts)
+
+ st.subheader("Future Directions")
+
+ col1, col2 = st.columns(2)
+
+ with col1:
+ st.markdown("""
+ ### Technological Developments
+
+ - **Multimodal AI models**: Increasingly sophisticated understanding
+ - **Historical font training**: Models trained on historical typography
+ - **Document intelligence**: Enhanced understanding of structures
+ - **Collaborative correction**: Platforms for collective improvement
+ """)
+
+ with col2:
+ st.markdown("""
+ ### Research Applications
+
+ - **Large-scale corpus analysis**: Processing entire archives
+ - **Multilingual historical research**: Working across languages
+ - **Image-text integration**: New methodologies for visual analysis
+ - **Computational paleography**: AI-assisted handwriting analysis
+ """)
+
+ st.markdown("""
+ ### Additional Resources
+
+ - **[Mistral AI Documentation](https://docs.mistral.ai/)**: Learn more about the OCR models used in this workshop
+ - **[Transkribus](https://readcoop.eu/transkribus/)**: Platform for historical document transcription
+ - **[OCR-D](https://ocr-d.de/en/)**: Coordinated OCR research project for historical documents
+ - **[Historical OCR Research Papers](https://scholar.google.com/scholar?q=historical+OCR)**: Academic research on historical OCR
+ """)
+
+ # Reset button to start over
+ if st.button("Start Workshop Again", key="reset_workshop", use_container_width=True):
+ st.session_state.current_module = 1
+ st.rerun()
+
+# Handle API requests if the URL contains /api/process
+if 'api/process' in st.query_params.get('', ''):
+ # Process the API request
+ result = process_api_request()
+ if result:
+ # Return the result as JSON
+ # Make sure result is serializable
+ serializable_result = make_serializable(result)
+ st.json(serializable_result)
+ else:
+ st.json({"error": "Invalid request"})
\ No newline at end of file
diff --git a/structured_ocr.py b/structured_ocr.py
index e94e9acef06f53728833bf21f0ac36866cb14cb4..ff80ba6010cd863225a93486561ee5dd12742c51 100644
--- a/structured_ocr.py
+++ b/structured_ocr.py
@@ -1,276 +1,49 @@
-# Standard library imports
import os
import sys
import time
-import random
+from enum import Enum
+from pathlib import Path
import json
import base64
+import pycountry
import logging
-from enum import Enum
-from pathlib import Path
-from functools import lru_cache
-from typing import Optional, Dict, Any, List, Union, Tuple
+from pydantic import BaseModel
+from mistralai import Mistral
+from mistralai import DocumentURLChunk, ImageURLChunk, TextChunk
# Configure logging
logging.basicConfig(level=logging.INFO,
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
-logger = logging.getLogger(__name__)
-
-# Third-party imports
-from pydantic import BaseModel
-
-# Try to import pycountry, provide fallback if not available
-try:
- import pycountry
- PYCOUNTRY_AVAILABLE = True
-except ImportError:
- PYCOUNTRY_AVAILABLE = False
- logger.warning("pycountry module not available - using language code fallback")
-
-# Try to import Mistral AI, provide fallback if not available
-try:
- from mistralai import Mistral
- from mistralai import DocumentURLChunk, ImageURLChunk, TextChunk
- from mistralai.models import OCRImageObject
- MISTRAL_AVAILABLE = True
-except ImportError:
- MISTRAL_AVAILABLE = False
- logger.warning("mistralai module not available - OCR functionality will be limited")
-
-# Import our language detection module
-try:
- from utils.helpers.language_detection import LanguageDetector
- LANG_DETECTOR_AVAILABLE = True
-except ImportError:
- LANG_DETECTOR_AVAILABLE = False
- logger.warning("language_detection module not available - using fallback language detection")
# Import utilities for OCR processing
try:
- from utils.image_utils import replace_images_in_markdown, get_combined_markdown
+ from ocr_utils import replace_images_in_markdown, get_combined_markdown
except ImportError:
- # Define minimal fallback functions if module not found
- logger.warning("Could not import utils.image_utils - using minimal fallback functions")
-
+ # Define fallback functions if module not found
def replace_images_in_markdown(markdown_str, images_dict):
- """Minimal fallback implementation of replace_images_in_markdown"""
- import re
- for img_id, base64_str in images_dict.items():
- # Match alt text OR link part, ignore extension
- base_id = img_id.split('.')[0]
- pattern = re.compile(rf"!\[[^\]]*{base_id}[^\]]*\]\([^\)]+\)")
- markdown_str = pattern.sub(f"", markdown_str)
+ for img_name, base64_str in images_dict.items():
+ markdown_str = markdown_str.replace(
+ f"", f""
+ )
return markdown_str
def get_combined_markdown(ocr_response):
- """Minimal fallback implementation of get_combined_markdown"""
markdowns = []
for page in ocr_response.pages:
image_data = {}
- if hasattr(page, "images"):
- for img in page.images:
- if hasattr(img, "id") and hasattr(img, "image_base64"):
- image_data[img.id] = img.image_base64
- page_markdown = page.markdown if hasattr(page, "markdown") else ""
- processed_markdown = replace_images_in_markdown(page_markdown, image_data)
- markdowns.append(processed_markdown)
+ for img in page.images:
+ image_data[img.id] = img.image_base64
+ markdowns.append(replace_images_in_markdown(page.markdown, image_data))
return "\n\n".join(markdowns)
# Import config directly (now local to historical-ocr)
-try:
- from config import MISTRAL_API_KEY, OCR_MODEL, TEXT_MODEL, VISION_MODEL, TEST_MODE, IMAGE_PREPROCESSING
-except ImportError:
- # Fallback defaults if config is not available
- import os
- MISTRAL_API_KEY = os.environ.get("MISTRAL_API_KEY", "")
- OCR_MODEL = "mistral-ocr-latest"
- TEXT_MODEL = "mistral-large-latest"
- VISION_MODEL = "mistral-large-latest"
- TEST_MODE = True
- # Default image preprocessing settings if config not available
- IMAGE_PREPROCESSING = {
- "max_size_mb": 8.0,
- # Add basic defaults for preprocessing
- "enhance_contrast": 1.2,
- "denoise": True,
- "compression_quality": 95
- }
- logging.warning("Config module not found. Using environment variables and defaults.")
-
-# Helper function to make OCR objects JSON serializable
-# Removed caching to fix unhashable type error
-def serialize_ocr_response(obj):
- """
- Convert OCR response objects to JSON serializable format
- Optimized for speed and memory usage
- """
- # Fast path: Handle primitive types directly
- if obj is None or isinstance(obj, (str, int, float, bool)):
- return obj
-
- # Handle collections with optimized recursion
- if isinstance(obj, list):
- return [serialize_ocr_response(item) for item in obj]
- elif isinstance(obj, dict):
- return {k: serialize_ocr_response(v) for k, v in obj.items()}
- elif hasattr(obj, '__dict__'):
- # For OCR objects with __dict__ attribute
- result = {}
- for key, value in obj.__dict__.items():
- if key.startswith('_'):
- continue # Skip private attributes
-
- # Fast path for OCRImageObject - most common complex object
- if isinstance(value, OCRImageObject):
- # Get image base64 data for validation
- image_base64 = value.image_base64 if hasattr(value, 'image_base64') else None
-
- # COMPLETELY REWRITTEN validation logic using proven test approach
- # Default to FALSE (treating as text) unless proven to be an image
- is_valid_image = False
-
- # Quick exit conditions
- if not image_base64 or not isinstance(image_base64, str):
- # No data or not a string - not a valid image
- is_valid_image = False
- logging.warning("Invalid image data (not a string)")
-
- # Case 1: Definite image with proper data URL prefix
- elif image_base64.startswith('data:image/'):
- is_valid_image = True
- logging.debug("Valid image with data:image/ prefix")
-
- # Case 2: Markdown image reference, not an actual image
- elif image_base64.startswith(''):
- is_valid_image = False
- logging.warning("Markdown image reference detected")
-
- # Extract the image ID for logging
- try:
- img_id = image_base64.split('![')[1].split('](')[0]
- logging.debug(f"Markdown reference for image: {img_id}")
- except:
- img_id = "unknown"
-
- # Case 3: Needs detailed text content detection
- else:
- # Use the same proven approach as in our tests
- # Take a sample for efficiency
- sample = image_base64[:min(len(image_base64), 1000)]
- sample_lower = sample.lower()
-
- # Check for obvious text features using multiple indicators
- has_spaces = ' ' in sample
- has_newlines = '\n' in sample
- has_punctuation = any(p in sample for p in ',.;:!?"\'()[]{}')
-
- # Check for sentence-like structures
- has_sentences = False
- for i in range(len(sample) - 5):
- if sample[i] in '.!?\n' and i+2 < len(sample) and sample[i+1] == ' ' and sample[i+2].isupper():
- has_sentences = True
- break
-
- # Check for common words with word boundary protection
- common_words = ['the', 'and', 'of', 'to', 'a', 'in', 'is', 'that', 'this', 'for']
- has_common_words = any(f" {word} " in f" {sample_lower} " for word in common_words)
-
- # Count the text indicators
- text_indicators = [has_spaces, has_newlines, has_punctuation, has_sentences, has_common_words]
- text_indicator_count = sum(1 for indicator in text_indicators if indicator)
-
- # Log detailed findings for debugging
- logging.debug(f"Text detection - spaces: {has_spaces}, newlines: {has_newlines}, " +
- f"punctuation: {has_punctuation}, sentences: {has_sentences}, " +
- f"common words: {has_common_words}")
- logging.debug(f"Text indicators found: {text_indicator_count}/5")
-
- # CRITICAL FIX: If we detect 2 or more text indicators, this is TEXT not an image!
- if text_indicator_count >= 2:
- is_valid_image = False
- logging.warning(f"Content identified as TEXT with {text_indicator_count}/5 indicators")
- # Only if we have no clear text indicators AND valid base64 chars, treat as image
- elif all(c in 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/='
- for c in image_base64[:100]):
- is_valid_image = True
- logging.debug("Valid base64 data with no text indicators")
- else:
- # Default to TEXT for anything else - safer approach
- is_valid_image = False
- logging.warning("No clear image patterns detected - treating as text by default")
-
- # Final validation result with definitive message
- logging.warning(f"FINAL CLASSIFICATION: OCRImageObject content type = {'IMAGE' if is_valid_image else 'TEXT'}")
-
- # Process based on final validation result
- if is_valid_image:
- # Process as image if validation passes
- result[key] = {
- 'id': value.id if hasattr(value, 'id') else None,
- 'image_base64': image_base64
- }
- else:
- # Process as text if validation fails, but properly handle markdown references
- if image_base64 and isinstance(image_base64, str):
- # Special handling for markdown image references
- if image_base64.startswith(''):
- # Extract the image description (alt text) if available
- try:
- # Parse the alt text from 
- alt_text = image_base64.split('![')[1].split('](')[0]
- # Use the alt text or a placeholder if it's just the image name
- if alt_text and not alt_text.endswith('.jpeg') and not alt_text.endswith('.jpg'):
- result[key] = f"[Image: {alt_text}]"
- else:
- # Just note that there's an image without the reference
- result[key] = "[Image]"
- logging.info(f"Converted markdown reference to text placeholder: {result[key]}")
- except:
- # Fallback for parsing errors
- result[key] = "[Image]"
- else:
- # Regular text content
- result[key] = image_base64
- else:
- result[key] = str(value)
- # Handle collections
- elif isinstance(value, list):
- result[key] = [serialize_ocr_response(item) for item in value]
- # Handle nested objects
- elif hasattr(value, '__dict__'):
- result[key] = serialize_ocr_response(value)
- # Handle primitives and other types
- else:
- result[key] = value
- return result
- else:
- return obj
+from config import MISTRAL_API_KEY, OCR_MODEL, TEXT_MODEL, VISION_MODEL
-# Create language enum for structured output - cache language lookup to avoid repeated processing
-@lru_cache(maxsize=1)
-def get_language_dict():
- if PYCOUNTRY_AVAILABLE:
- return {lang.alpha_2: lang.name for lang in pycountry.languages if hasattr(lang, 'alpha_2')}
- else:
- # Fallback with basic languages when pycountry is not available
- return {
- "en": "English",
- "es": "Spanish",
- "fr": "French",
- "de": "German",
- "it": "Italian",
- "pt": "Portuguese",
- "ru": "Russian",
- "zh": "Chinese",
- "ja": "Japanese",
- "ar": "Arabic",
- "hi": "Hindi",
- "la": "Latin"
- }
+# Create language enum for structured output
+languages = {lang.alpha_2: lang.name for lang in pycountry.languages if hasattr(lang, 'alpha_2')}
class LanguageMeta(Enum.__class__):
def __new__(metacls, cls, bases, classdict):
- languages = get_language_dict()
for code, name in languages.items():
classdict[name.upper().replace(' ', '_')] = name
return super().__new__(metacls, cls, bases, classdict)
@@ -287,59 +60,10 @@ class StructuredOCRModel(BaseModel):
class StructuredOCR:
def __init__(self, api_key=None):
"""Initialize the OCR processor with API key"""
- # Set up logger for this class instance
- self.logger = logging.getLogger(__name__)
-
- # Check if we're running in test mode or if Mistral is not available
- self.test_mode = TEST_MODE or not MISTRAL_AVAILABLE
- # Initialize current filename for language detection
- self.current_filename = None
-
- if not MISTRAL_AVAILABLE:
- self.logger.warning("Mistral AI package not available - running in test mode")
- self.api_key = "placeholder_key"
- self.client = None
- return
-
- # Initialize API key - use provided key, or environment var
- if self.test_mode and not api_key:
- self.api_key = "placeholder_key"
- else:
- self.api_key = api_key or MISTRAL_API_KEY
-
- # Ensure we have a valid API key when not in test mode
- if not self.api_key and not self.test_mode:
- raise ValueError("No Mistral API key provided. Please set the MISTRAL_API_KEY environment variable or enable TEST_MODE.")
-
- # Clean the API key by removing any whitespace
- self.api_key = self.api_key.strip()
-
- # Check if API key exists but don't enforce length requirements
- if not self.test_mode and not self.api_key:
- self.logger.warning("Warning: No API key provided")
-
- # Initialize client with the API key
- try:
- self.client = Mistral(api_key=self.api_key)
- # Skip validation to avoid unnecessary API calls
- except Exception as e:
- error_msg = str(e).lower()
- if "unauthorized" in error_msg or "401" in error_msg:
- raise ValueError(f"API key authentication failed. Please check your Mistral API key: {str(e)}")
- else:
- self.logger.warning(f"Failed to initialize Mistral client: {str(e)}")
- self.test_mode = True
- self.client = None
-
- # Initialize language detector
- if LANG_DETECTOR_AVAILABLE:
- self.logger.info("Using statistical language detection module")
- self.language_detector = LanguageDetector()
- else:
- self.logger.warning("External language detection not available - using internal fallback")
- self.language_detector = None
+ self.api_key = api_key or MISTRAL_API_KEY
+ self.client = Mistral(api_key=self.api_key)
- def process_file(self, file_path, file_type=None, use_vision=True, max_pages=None, file_size_mb=None, custom_pages=None, custom_prompt=None):
+ def process_file(self, file_path, file_type=None, use_vision=True, max_pages=None, file_size_mb=None, custom_pages=None):
"""Process a file and return structured OCR results
Args:
@@ -349,7 +73,6 @@ class StructuredOCR:
max_pages: Optional limit on number of pages to process
file_size_mb: Optional file size in MB (used for automatic page limiting)
custom_pages: Optional list of specific page numbers to process
- custom_prompt: Optional instructions for the AI to handle unusual document formatting or specific extraction needs
Returns:
Dictionary with structured OCR results
@@ -357,19 +80,10 @@ class StructuredOCR:
# Convert file_path to Path object if it's a string
file_path = Path(file_path)
- # Store current filename for language detection
- self.current_filename = file_path.name
-
# Auto-detect file type if not provided
if file_type is None:
suffix = file_path.suffix.lower()
file_type = "pdf" if suffix == ".pdf" else "image"
-
- # Check for handwritten document by filename
- filename_lower = file_path.name.lower()
- if "handwritten" in filename_lower or "manuscript" in filename_lower or "letter" in filename_lower:
- logger.info(f"Detected likely handwritten document from filename: {file_path.name}")
- # This will be used during processing to apply handwritten-specific handling
# Get file size if not provided
if file_size_mb is None and file_path.exists():
@@ -406,9 +120,9 @@ class StructuredOCR:
# Read and process the file
if file_type == "pdf":
- result = self._process_pdf(file_path, use_vision, max_pages, custom_pages, custom_prompt)
+ result = self._process_pdf(file_path, use_vision, max_pages, custom_pages)
else:
- result = self._process_image(file_path, use_vision, custom_prompt)
+ result = self._process_image(file_path, use_vision)
# Add processing time information
processing_time = time.time() - start_time
@@ -418,689 +132,120 @@ class StructuredOCR:
if 'confidence_score' not in result:
result['confidence_score'] = 0.85 # Default confidence
- # Ensure the entire result is fully JSON serializable by running it through our serializer
- try:
- # First convert to a standard dict if it's not already
- if not isinstance(result, dict):
- result = serialize_ocr_response(result)
-
- # Make a final pass to check for any remaining non-serializable objects
- # Proactively check for OCRImageObject instances to avoid serialization warnings
- def has_ocr_image_objects(obj):
- """Check if object contains any OCRImageObject instances recursively"""
- if isinstance(obj, dict):
- return any(has_ocr_image_objects(v) for v in obj.values())
- elif isinstance(obj, list):
- return any(has_ocr_image_objects(item) for item in obj)
- else:
- return 'OCRImageObject' in str(type(obj))
-
- # Apply serialization preemptively if OCRImageObjects are detected
- if has_ocr_image_objects(result):
- # Quietly apply full serialization before any errors occur
- result = serialize_ocr_response(result)
- else:
- # Test JSON serialization to catch any other issues
- json.dumps(result)
- except TypeError as e:
- # If there's still a serialization error, run the whole result through our serializer
- logger = logging.getLogger("serializer")
- logger.warning(f"JSON serialization error in result: {str(e)}. Applying full serialization.")
- # Use a more robust approach to ensure complete serialization
- try:
- # First attempt with our custom serializer
- result = serialize_ocr_response(result)
- # Test if it's fully serializable now
- json.dumps(result)
- except Exception as inner_e:
- # If still not serializable, convert to a simpler format
- logger.warning(f"Secondary serialization error: {str(inner_e)}. Converting to basic format.")
- # Create a simplified result with just the essential information
- simplified_result = {
- "file_name": result.get("file_name", "unknown"),
- "topics": result.get("topics", ["Document"]),
- "languages": [str(lang) for lang in result.get("languages", ["English"]) if lang is not None],
- "ocr_contents": {
- "raw_text": result.get("ocr_contents", {}).get("raw_text", "Text extraction failed due to serialization error")
- },
- "serialization_error": f"Original result could not be fully serialized: {str(e)}"
- }
- result = simplified_result
-
return result
- def _process_pdf(self, file_path, use_vision=True, max_pages=None, custom_pages=None, custom_prompt=None):
- """
- Process a PDF file with OCR - optimized version with smart page handling and memory management
+ def _process_pdf(self, file_path, use_vision=True, max_pages=None, custom_pages=None):
+ """Process a PDF file with OCR
Args:
file_path: Path to the PDF file
- use_vision: Whether to use vision model for enhanced analysis
+ use_vision: Whether to use vision model
max_pages: Optional limit on the number of pages to process
custom_pages: Optional list of specific page numbers to process
- custom_prompt: Optional custom prompt for specialized extraction
"""
logger = logging.getLogger("pdf_processor")
logger.info(f"Processing PDF: {file_path}")
- # Track processing time
- start_time = time.time()
-
- # Fast path: Return placeholder if in test mode
- if self.test_mode:
- logger.info("Test mode active, returning placeholder response")
- # Enhanced test mode placeholder that's more realistic
- return {
- "file_name": file_path.name,
- "topics": ["Historical Document", "Literature", "American History"],
- "languages": ["English"],
- "ocr_contents": {
- "title": "Harper's New Monthly Magazine",
- "publication_date": "1855",
- "publisher": "Harper & Brothers, New York",
- "raw_text": "This is a test mode placeholder for Harper's New Monthly Magazine from 1855. The actual document contains articles on literature, politics, science, and culture from mid-19th century America.",
- "content": "The magazine includes various literary pieces, poetry, political commentary, and illustrations typical of 19th century periodicals. Known for publishing works by prominent American authors including Herman Melville and Charles Dickens.",
- "key_figures": ["Herman Melville", "Charles Dickens", "Henry Wadsworth Longfellow"],
- "noted_articles": ["Continued serialization of popular novels", "Commentary on contemporary political events", "Scientific discoveries and technological advancements"]
- },
- "pdf_processing_method": "enhanced_test_mode",
- "total_pages": 12,
- "processed_pages": 3,
- "processing_time": 0.5,
- "confidence_score": 0.9
- }
-
try:
- # PDF processing strategy decision based on file size
- file_size_mb = file_path.stat().st_size / (1024 * 1024)
- logger.info(f"PDF size: {file_size_mb:.2f} MB")
-
- # Always use pdf2image for better control and consistency across all PDF files
- use_pdf2image = True
-
- # First try local PDF processing for better performance and control
- if use_pdf2image:
- try:
- import tempfile
- from pdf2image import convert_from_path
-
- logger.info("Processing PDF using pdf2image for better multi-page handling")
-
- # Convert PDF to images with optimized parameters
- conversion_start = time.time()
-
- # Use consistent DPI for all files to ensure reliable results
- dpi = 200 # Higher quality DPI for all files to ensure better text recognition
-
- # Only convert first page initially to check document type
- pdf_first_page = convert_from_path(file_path, dpi=dpi, first_page=1, last_page=1)
- logger.info(f"First page converted in {time.time() - conversion_start:.2f}s")
-
- # Quick check if PDF has readable content
- if not pdf_first_page:
- logger.warning("PDF conversion produced no images, falling back to API")
- raise Exception("PDF conversion failed to produce images")
-
- # Determine total pages in the document
- # First, try simple estimate from first page conversion
- total_pages = 1
-
- # Try pdf2image info extraction
- try:
- # Try with pdf2image page counting - use simpler parameters
- logger.info("Determining PDF page count...")
- count_start = time.time()
-
- # Use a lightweight approach with multi-threading for faster processing
- pdf_info = convert_from_path(
- file_path,
- dpi=72, # Low DPI just for info
- first_page=1,
- last_page=1,
- size=(100, 100), # Tiny image to save memory
- fmt="jpeg",
- thread_count=4, # Increased thread count for faster processing
- output_file=None
- )
-
- # Extract page count
- if hasattr(pdf_info, 'n_pages'):
- total_pages = pdf_info.n_pages
- elif isinstance(pdf_info, dict) and "Pages" in pdf_info:
- total_pages = int(pdf_info.get("Pages", "1"))
- elif len(pdf_first_page) > 0:
- # Just estimate based on first page - at least we have one
- total_pages = 1
-
- logger.info(f"Page count determined in {time.time() - count_start:.2f}s")
- except Exception as count_error:
- logger.warning(f"Error determining page count: {str(count_error)}. Using default of 1")
- total_pages = 1
-
- logger.info(f"PDF has {total_pages} total pages")
-
- # Determine which pages to process
- pages_to_process = []
-
- # Handle custom page selection if provided
- if custom_pages and any(0 < p <= total_pages for p in custom_pages):
- # Filter valid page numbers
- pages_to_process = [p for p in custom_pages if 0 < p <= total_pages]
- logger.info(f"Processing {len(pages_to_process)} custom-selected pages: {pages_to_process}")
- # Otherwise use max_pages limit if provided
- elif max_pages and max_pages < total_pages:
- pages_to_process = list(range(1, max_pages + 1))
- logger.info(f"Processing first {max_pages} pages of {total_pages} total")
- # Or process all pages if reasonable count
- elif total_pages <= 10:
- pages_to_process = list(range(1, total_pages + 1))
- logger.info(f"Processing all {total_pages} pages")
- # For large documents without limits, process subset of pages
- else:
- # Smart sampling: first page, last page, and some pages in between
- pages_to_process = [1] # Always include first page
-
- if total_pages > 1:
- if total_pages <= 5:
- # For few pages, process all
- pages_to_process = list(range(1, total_pages + 1))
- else:
- # For many pages, sample intelligently
- # Add pages from the middle of the document
- middle = total_pages // 2
- # Add last page if more than 3 pages
- if total_pages > 3:
- pages_to_process.append(total_pages)
- # Add up to 3 pages from middle if document is large
- if total_pages > 5:
- pages_to_process.append(middle)
- if total_pages > 10:
- pages_to_process.append(middle // 2)
- pages_to_process.append(middle + (middle // 2))
-
- # Sort pages for sequential processing
- pages_to_process = sorted(list(set(pages_to_process)))
- logger.info(f"Processing {len(pages_to_process)} sampled pages out of {total_pages} total: {pages_to_process}")
-
- # Convert only the selected pages to minimize memory usage
- selected_images = []
- combined_text = []
- detected_languages = set() # Track detected languages across all pages
-
- # Process pages in larger batches for better efficiency
- batch_size = 5 # Process 5 pages at a time for better throughput
- for i in range(0, len(pages_to_process), batch_size):
- batch_pages = pages_to_process[i:i+batch_size]
- logger.info(f"Converting batch of pages {batch_pages}")
-
- # Convert batch of pages with multi-threading for better performance
- batch_start = time.time()
- batch_images = convert_from_path(
- file_path,
- dpi=dpi,
- first_page=min(batch_pages),
- last_page=max(batch_pages),
- thread_count=4, # Use multi-threading for faster PDF processing
- fmt="jpeg" # Use JPEG format for better compatibility
- )
- logger.info(f"Batch conversion completed in {time.time() - batch_start:.2f}s")
-
- # Map converted images to requested page numbers
- for idx, page_num in enumerate(range(min(batch_pages), max(batch_pages) + 1)):
- if page_num in pages_to_process and idx < len(batch_images):
- if page_num == pages_to_process[0]: # First page to process
- selected_images.append(batch_images[idx])
-
- # Process each page individually
- with tempfile.NamedTemporaryFile(suffix='.jpeg', delete=False) as tmp:
- batch_images[idx].save(tmp.name, format='JPEG')
- # Simple OCR to extract text
- try:
- page_result = self._process_image(Path(tmp.name), False, None)
- if 'ocr_contents' in page_result and 'raw_text' in page_result['ocr_contents']:
- # Add page text to combined text without obvious page markers
- page_text = page_result['ocr_contents']['raw_text']
- combined_text.append(f"{page_text}")
-
- # Collect detected languages from each page
- if 'languages' in page_result:
- for lang in page_result['languages']:
- detected_languages.add(lang)
- except Exception as page_e:
- logger.warning(f"Error processing page {page_num}: {str(page_e)}")
- # Clean up temp file
- import os
- os.unlink(tmp.name)
-
- # If we have processed pages
- if selected_images and combined_text:
- # Save first image to temp file for vision model
- with tempfile.NamedTemporaryFile(suffix='.jpeg', delete=False) as tmp:
- selected_images[0].save(tmp.name, format='JPEG', quality=95)
- first_image_path = tmp.name
-
- # Combine all extracted text
- all_text = "\n\n".join(combined_text)
-
- # For custom prompts, use specialized processing
- if custom_prompt:
- try:
- # Process image with vision model
- result = self._process_image(Path(first_image_path), use_vision, None)
-
- # Enhance with text analysis using combined text from all pages
- enhanced_result = self._extract_structured_data_text_only(all_text, file_path.name, custom_prompt)
-
- # Merge results, keeping images from original result
- for key, value in enhanced_result.items():
- if key not in ('raw_response_data', 'pages_data', 'has_images'):
- result[key] = value
-
- # Update raw text with full document text
- if 'ocr_contents' in result:
- result['ocr_contents']['raw_text'] = all_text
-
- # Add flag to indicate custom prompt was applied
- result['custom_prompt_applied'] = 'text_only'
-
- # Simplified approach - no document type detection
-
- except Exception as e:
- logger.warning(f"Custom prompt processing failed: {str(e)}. Using standard processing.")
- # Fall back to standard processing
- result = self._process_image(Path(first_image_path), use_vision, None)
- if 'ocr_contents' in result:
- result['ocr_contents']['raw_text'] = all_text
- else:
- # Standard processing with combined text
- result = self._process_image(Path(first_image_path), use_vision, None)
- if 'ocr_contents' in result:
- result['ocr_contents']['raw_text'] = all_text
-
- # Merge detected languages if available
- if detected_languages:
- result['languages'] = list(detected_languages)
-
- # Add PDF metadata
- result['file_name'] = file_path.name
- result['pdf_processing_method'] = 'pdf2image_optimized'
- result['total_pages'] = total_pages
- result['processed_pages'] = len(pages_to_process)
- result['pages_processed'] = pages_to_process
-
- # Add processing info
- result['processing_info'] = {
- 'method': 'local_pdf_processing',
- 'dpi': dpi,
- 'pages_sampled': pages_to_process,
- 'processing_time': time.time() - start_time
- }
-
- # Clean up
- os.unlink(first_image_path)
-
- return result
- else:
- logger.warning("No pages successfully processed with pdf2image, falling back to API")
- raise Exception("Failed to process PDF pages locally")
-
- except Exception as pdf2image_error:
- logger.warning(f"Local PDF processing failed, falling back to API: {str(pdf2image_error)}")
- # Fall back to API processing
-
- # API-based PDF processing
- logger.info("Processing PDF via Mistral API")
-
- # Optimize file upload for faster processing
+ # Upload the PDF file
logger.info("Uploading PDF file to Mistral API")
- upload_start = time.time()
-
- # Set appropriate timeout based on file size
- upload_timeout = max(60, min(300, int(file_size_mb * 5))) # 60s to 300s based on size
-
- try:
- # Upload the file (Mistral client doesn't support timeout parameter for upload)
- uploaded_file = self.client.files.upload(
- file={
- "file_name": file_path.stem,
- "content": file_path.read_bytes(),
- },
- purpose="ocr"
- )
-
- logger.info(f"PDF uploaded in {time.time() - upload_start:.2f}s")
-
- # Get a signed URL for the uploaded file
- signed_url = self.client.files.get_signed_url(file_id=uploaded_file.id, expiry=1)
-
- # Process the PDF with OCR - use adaptive timeout based on file size
- logger.info(f"Processing PDF with OCR using {OCR_MODEL}")
-
- # Adaptive retry strategy based on file size
- max_retries = 3 if file_size_mb < 20 else 2 # Fewer retries for large files
- base_retry_delay = 1 if file_size_mb < 10 else 2 # Longer delays for large files
-
- # Adaptive timeout based on file size
- ocr_timeout_ms = min(180000, max(60000, int(file_size_mb * 3000))) # 60s to 180s
-
- # Try processing with retries
- for retry in range(max_retries):
- try:
- ocr_start = time.time()
- pdf_response = self.client.ocr.process(
- document=DocumentURLChunk(document_url=signed_url.url),
- model=OCR_MODEL,
- include_image_base64=True,
- timeout_ms=ocr_timeout_ms
- )
- logger.info(f"PDF OCR processing completed in {time.time() - ocr_start:.2f}s")
- break # Success, exit retry loop
- except Exception as e:
- error_msg = str(e)
- logger.warning(f"API error on attempt {retry+1}/{max_retries}: {error_msg}")
-
- # Handle errors with optimized retry logic
- error_lower = error_msg.lower()
-
- # Authentication errors - no point in retrying
- if any(term in error_lower for term in ["unauthorized", "401", "403", "authentication"]):
- logger.error("API authentication failed. Check your API key.")
- raise ValueError(f"Authentication failed. Please verify your Mistral API key: {error_msg}")
-
- # Connection or server errors - worth retrying
- elif any(term in error_lower for term in ["connection", "timeout", "520", "server error", "502", "503", "504"]):
- if retry < max_retries - 1:
- # Exponential backoff with jitter for better retry behavior
- wait_time = base_retry_delay * (2 ** retry) * (0.8 + 0.4 * random.random())
- logger.info(f"Connection issue detected. Waiting {wait_time:.1f}s before retry...")
- time.sleep(wait_time)
- else:
- # Last retry failed
- logger.error("Maximum retries reached, API connection error persists.")
- raise ValueError(f"Could not connect to Mistral API after {max_retries} attempts: {error_msg}")
-
- # Rate limit errors - much longer wait
- elif any(term in error_lower for term in ["rate limit", "429", "too many requests", "requests rate limit exceeded"]):
- # Check specifically for token exhaustion vs temporary rate limit
- if "quota" in error_lower or "credit" in error_lower or "subscription" in error_lower:
- logger.error("API quota or credit limit reached. No retry will help.")
- raise ValueError(f"Mistral API quota or credit limit reached. Please check your subscription: {error_msg}")
- elif retry < max_retries - 1:
- wait_time = base_retry_delay * (2 ** retry) * 6.0 # Significantly longer wait for rate limits
- logger.info(f"Rate limit exceeded. Waiting {wait_time:.1f}s before retry...")
- time.sleep(wait_time)
- else:
- logger.error("Maximum retries reached, rate limit error persists.")
- raise ValueError(f"API rate limit exceeded. Please try again later: {error_msg}")
-
- # Misc errors - typically no retry will help
- else:
- if retry < max_retries - 1 and any(term in error_lower for term in ["transient", "temporary"]):
- # Only retry for errors explicitly marked as transient
- wait_time = base_retry_delay * (2 ** retry)
- logger.info(f"Transient error detected. Waiting {wait_time:.1f}s before retry...")
- time.sleep(wait_time)
- else:
- logger.error(f"Unrecoverable API error: {error_msg}")
- raise
-
- # Calculate the number of pages to process
- pages_to_process = pdf_response.pages
- total_pages = len(pdf_response.pages)
- limited_pages = False
-
- logger.info(f"API returned {total_pages} total PDF pages")
-
- # Smart page selection logic for better performance
- if custom_pages:
- # Convert to 0-based indexing and filter valid page numbers
- valid_indices = [i-1 for i in custom_pages if 0 < i <= total_pages]
- if valid_indices:
- pages_to_process = [pdf_response.pages[i] for i in valid_indices]
- limited_pages = True
- logger.info(f"Processing {len(valid_indices)} custom-selected pages")
- # Max pages limit with smart sampling
- elif max_pages and total_pages > max_pages:
- if max_pages == 1:
- # Just first page
- pages_to_process = pages_to_process[:1]
- elif max_pages < 5 and total_pages > 10:
- # For small max_pages on large docs, include first, last, and middle
- indices = [0] # First page
- if max_pages > 1:
- indices.append(total_pages - 1) # Last page
- if max_pages > 2:
- indices.append(total_pages // 2) # Middle page
- # Add more pages up to max_pages if needed
- if max_pages > 3:
- remaining = max_pages - len(indices)
- step = total_pages // (remaining + 1)
- for i in range(1, remaining + 1):
- idx = i * step
- if idx not in indices and 0 <= idx < total_pages:
- indices.append(idx)
- indices.sort()
- pages_to_process = [pdf_response.pages[i] for i in indices]
- else:
- # Default: first max_pages
- pages_to_process = pages_to_process[:max_pages]
-
+ uploaded_file = self.client.files.upload(
+ file={
+ "file_name": file_path.stem,
+ "content": file_path.read_bytes(),
+ },
+ purpose="ocr",
+ )
+
+ # Get a signed URL for the uploaded file
+ signed_url = self.client.files.get_signed_url(file_id=uploaded_file.id, expiry=1)
+
+ # Process the PDF with OCR
+ logger.info(f"Processing PDF with OCR using {OCR_MODEL}")
+ pdf_response = self.client.ocr.process(
+ document=DocumentURLChunk(document_url=signed_url.url),
+ model=OCR_MODEL,
+ include_image_base64=True
+ )
+
+ # Limit pages if requested
+ pages_to_process = pdf_response.pages
+ total_pages = len(pdf_response.pages)
+ limited_pages = False
+
+ logger.info(f"PDF has {total_pages} total pages")
+
+ # Handle custom page selection if provided
+ if custom_pages:
+ # Convert to 0-based indexing and filter valid page numbers
+ valid_indices = [i-1 for i in custom_pages if 0 < i <= total_pages]
+ if valid_indices:
+ pages_to_process = [pdf_response.pages[i] for i in valid_indices]
limited_pages = True
- logger.info(f"Processing {len(pages_to_process)} pages out of {total_pages} total")
-
- # Directly extract any language information from the OCR response
- detected_languages = set()
-
- # Check if the response has a 'languages' attribute in any form
- # First check direct attributes on the response object
- if hasattr(pdf_response, 'languages') and pdf_response.languages:
- for lang in pdf_response.languages:
- detected_languages.add(str(lang))
- logger.info(f"Found language in OCR response: {lang}")
-
- # Then check if it's in the response as a dictionary format
- elif hasattr(pdf_response, '__dict__'):
- response_dict = pdf_response.__dict__
- if 'languages' in response_dict and response_dict['languages']:
- for lang in response_dict['languages']:
- detected_languages.add(str(lang))
- logger.info(f"Found language in OCR response dict: {lang}")
-
- # Calculate confidence score if available
- try:
- confidence_values = [page.confidence for page in pages_to_process if hasattr(page, 'confidence')]
- confidence_score = sum(confidence_values) / len(confidence_values) if confidence_values else 0.89
- except Exception:
- confidence_score = 0.89 # Improved default
-
- # Merge page content intelligently - include page numbers for better context
- all_markdown = []
- for idx, page in enumerate(pages_to_process):
- # Try to determine actual page number
- if custom_pages and len(custom_pages) == len(pages_to_process):
- page_num = custom_pages[idx]
- else:
- # Estimate page number - may not be accurate with sampling
- page_num = idx + 1
-
- page_markdown = page.markdown if hasattr(page, 'markdown') else ""
- # Add page content without obvious page markers
- if page_markdown.strip():
- all_markdown.append(f"{page_markdown}")
-
- # Collect language information from individual pages if available
- if hasattr(page, 'languages') and page.languages:
- for lang in page.languages:
- detected_languages.add(str(lang))
- logger.info(f"Found language in page {page_num}: {lang}")
-
- # Join all pages with separation
- combined_markdown = "\n\n".join(all_markdown)
-
- # Extract structured data with the appropriate model
- if use_vision:
- # Try to get a good image for vision model
- vision_image = None
-
- # Try first page with images
- for page in pages_to_process:
- if hasattr(page, 'images') and page.images:
- vision_image = page.images[0].image_base64
- break
-
- if vision_image:
- # Use vision model with enhanced prompt
- logger.info(f"Using vision model: {VISION_MODEL}")
- result = self._extract_structured_data_with_vision(
- vision_image, combined_markdown, file_path.name, custom_prompt
- )
- else:
- # Fall back to text-only if no images available
- logger.info(f"No images in PDF, falling back to text model: {TEXT_MODEL}")
- result = self._extract_structured_data_text_only(
- combined_markdown, file_path.name, custom_prompt
- )
+ logger.info(f"Processing {len(valid_indices)} custom-selected pages")
+ # Otherwise handle max_pages limit
+ elif max_pages and total_pages > max_pages:
+ pages_to_process = pages_to_process[:max_pages]
+ limited_pages = True
+ logger.info(f"Processing only first {max_pages} pages out of {total_pages} total pages")
+
+ # Calculate average confidence score based on OCR response if available
+ confidence_score = 0.0
+ try:
+ # Some OCR APIs provide confidence scores
+ confidence_values = []
+ for page in pages_to_process:
+ if hasattr(page, 'confidence'):
+ confidence_values.append(page.confidence)
+
+ if confidence_values:
+ confidence_score = sum(confidence_values) / len(confidence_values)
else:
- # Use text-only model as requested
- logger.info(f"Using text-only model as specified: {TEXT_MODEL}")
- result = self._extract_structured_data_text_only(
- combined_markdown, file_path.name, custom_prompt
- )
-
- # If we have detected languages directly from the OCR model, use them
- if detected_languages:
- logger.info(f"Using languages detected by OCR model: {', '.join(detected_languages)}")
- result['languages'] = list(detected_languages)
- # Add flag to indicate source of language detection
- result['language_detection_source'] = 'mistral-ocr-latest'
-
- # Add metadata about pages
- if limited_pages:
- result['limited_pages'] = {
- 'processed': len(pages_to_process),
- 'total': total_pages
- }
-
- # Set confidence score from OCR
- result['confidence_score'] = confidence_score
-
- # Add processing method info
- result['pdf_processing_method'] = 'api'
- result['total_pages'] = total_pages
- result['processed_pages'] = len(pages_to_process)
-
- # Store serialized OCR response for rendering
- serialized_response = serialize_ocr_response(pdf_response)
- result['raw_response_data'] = serialized_response
-
- # Check if there are images to include
- has_images = hasattr(pdf_response, 'pages') and any(
- hasattr(page, 'images') and page.images for page in pdf_response.pages
- )
- result['has_images'] = has_images
+ confidence_score = 0.85 # Default if no confidence scores available
+ except:
+ confidence_score = 0.85 # Default fallback
- # Include image data for rendering if available
- if has_images:
- # Prepare pages data with image references
- result['pages_data'] = []
-
- # Get serialized pages - handle different formats
- serialized_pages = None
- try:
- if hasattr(serialized_response, 'pages'):
- serialized_pages = serialized_response.pages
- elif isinstance(serialized_response, dict) and 'pages' in serialized_response:
- serialized_pages = serialized_response.get('pages', [])
- else:
- # No pages found in response
- logger.warning("No pages found in OCR response")
- serialized_pages = []
- except Exception as pages_err:
- logger.warning(f"Error extracting pages from OCR response: {str(pages_err)}")
- serialized_pages = []
-
- # Process each page to extract images
- for page_idx, page in enumerate(serialized_pages):
- try:
- # Skip processing pages not in our selection
- if limited_pages and page_idx >= len(pages_to_process):
- continue
-
- # Extract page data with careful error handling
- markdown = ""
- images = []
-
- # Handle different page formats safely
- if isinstance(page, dict):
- markdown = page.get('markdown', '')
- images = page.get('images', [])
- else:
- # Try attribute access
- if hasattr(page, 'markdown'):
- markdown = page.markdown
- if hasattr(page, 'images'):
- images = page.images
-
- # Create page data record
- page_data = {
- 'page_number': page_idx + 1,
- 'markdown': markdown,
- 'images': []
- }
-
- # Process images with careful error handling
- for img_idx, img in enumerate(images):
- try:
- # Extract image ID and base64 data
- img_id = None
- img_base64 = None
-
- if isinstance(img, dict):
- img_id = img.get('id')
- img_base64 = img.get('image_base64')
- else:
- # Try attribute access
- if hasattr(img, 'id'):
- img_id = img.id
- if hasattr(img, 'image_base64'):
- img_base64 = img.image_base64
-
- # Only add if we have valid image data
- if img_base64 and isinstance(img_base64, str):
- # Ensure ID exists
- safe_id = img_id if img_id else f"img_{page_idx}_{img_idx}"
- page_data['images'].append({
- 'id': safe_id,
- 'image_base64': img_base64
- })
- except Exception as img_err:
- logger.warning(f"Error processing image {img_idx} on page {page_idx+1}: {str(img_err)}")
- continue # Skip this image
-
- # Add page data if it has content
- if page_data['markdown'] or page_data['images']:
- result['pages_data'].append(page_data)
-
- except Exception as page_err:
- logger.warning(f"Error processing page {page_idx+1}: {str(page_err)}")
- continue # Skip this page
-
- # Record final processing time
- total_time = time.time() - start_time
- result['processing_time'] = total_time
- logger.info(f"PDF API processing completed in {total_time:.2f}s")
-
- return result
+ # Combine pages' markdown into a single string
+ all_markdown = "\n\n".join([page.markdown for page in pages_to_process])
+
+ # Extract structured data using the appropriate model
+ if use_vision:
+ # Get base64 of first page for vision model
+ first_page_image = None
+ if pages_to_process and pages_to_process[0].images:
+ first_page_image = pages_to_process[0].images[0].image_base64
+
+ if first_page_image:
+ # Use vision model
+ logger.info(f"Using vision model: {VISION_MODEL}")
+ result = self._extract_structured_data_with_vision(first_page_image, all_markdown, file_path.name)
+ else:
+ # Fall back to text-only model if no image available
+ logger.info(f"No images in PDF, falling back to text model: {TEXT_MODEL}")
+ result = self._extract_structured_data_text_only(all_markdown, file_path.name)
+ else:
+ # Use text-only model
+ logger.info(f"Using text-only model: {TEXT_MODEL}")
+ result = self._extract_structured_data_text_only(all_markdown, file_path.name)
+
+ # Add page limit info to result if needed
+ if limited_pages:
+ result['limited_pages'] = {
+ 'processed': len(pages_to_process),
+ 'total': total_pages
+ }
- except Exception as api_e:
- logger.error(f"Error in API-based PDF processing: {str(api_e)}")
- # Re-raise to be caught by outer exception handler
- raise
+ # Add confidence score
+ result['confidence_score'] = confidence_score
+ # Store the raw OCR response for image rendering
+ result['raw_response'] = pdf_response
+
+ logger.info(f"PDF processing completed successfully")
+ return result
+
except Exception as e:
- # Log the error and return a helpful error result
logger.error(f"Error processing PDF: {str(e)}")
-
# Return basic result on error
return {
"file_name": file_path.name,
@@ -1111,346 +256,52 @@ class StructuredOCR:
"ocr_contents": {
"error": f"Failed to process PDF: {str(e)}",
"partial_text": "Document could not be fully processed."
- },
- "processing_time": time.time() - start_time
+ }
}
- def _process_image(self, file_path, use_vision=True, custom_prompt=None):
+ def _process_image(self, file_path, use_vision=True):
"""Process an image file with OCR"""
logger = logging.getLogger("image_processor")
logger.info(f"Processing image: {file_path}")
- # Check if we're in test mode
- if self.test_mode:
- # Return a placeholder document response
- return {
- "file_name": file_path.name,
- "topics": ["Document"],
- "languages": ["English"],
- "ocr_contents": {
- "title": "Document",
- "content": "Please set up API key to process documents."
- },
- "processing_time": 0.5,
- "confidence_score": 0.0
- }
-
- # No automatic document type detection - rely on the document type specified in the custom prompt
- # The document type is passed from the UI through the custom prompt in ocr_processing.py
-
try:
- # Check file size
- file_size_mb = file_path.stat().st_size / (1024 * 1024)
- logger.info(f"Original image size: {file_size_mb:.2f} MB")
-
- # Use enhanced preprocessing functions from ocr_utils
- try:
- from preprocessing import preprocess_image
- from utils.file_utils import get_base64_from_bytes
-
- logger.info(f"Applying image preprocessing for OCR")
-
- # Get preprocessing settings from config
- max_size_mb = IMAGE_PREPROCESSING.get("max_size_mb", 8.0)
-
- if file_size_mb > max_size_mb:
- logger.info(f"Image is large ({file_size_mb:.2f} MB), optimizing for API submission")
-
- # Use standard preprocessing - document type will be handled by preprocessing.py
- # based on the options passed from the UI
- base64_data_url = get_base64_from_bytes(
- preprocess_image(file_path.read_bytes(),
- {"document_type": "standard",
- "grayscale": True,
- "denoise": True,
- "contrast": 0})
- )
-
- logger.info(f"Image preprocessing completed successfully")
-
- except (ImportError, AttributeError) as e:
- # Fallback to basic processing if advanced functions not available
- logger.warning(f"Advanced preprocessing not available: {str(e)}. Using basic image processing.")
-
- # If image is larger than 8MB, resize it to reduce API payload size
- if file_size_mb > 8:
- logger.info("Image is large, resizing before API submission")
- try:
- from PIL import Image
- import io
-
- # Open and process the image
- with Image.open(file_path) as img:
- # Convert to RGB if not already (prevents mode errors)
- if img.mode != 'RGB':
- img = img.convert('RGB')
-
- # Calculate new dimensions (maintain aspect ratio)
- # Target around 2000-2500 pixels on longest side for better OCR quality
- width, height = img.size
- max_dimension = max(width, height)
- target_dimension = 2000 # Restored to 2000 for better image quality
-
- if max_dimension > target_dimension:
- scale_factor = target_dimension / max_dimension
- resized_width = int(width * scale_factor)
- resized_height = int(height * scale_factor)
- # Use LANCZOS instead of BILINEAR for better quality
- img = img.resize((resized_width, resized_height), Image.LANCZOS)
-
- # Enhance contrast for better text recognition
- from PIL import ImageEnhance
- enhancer = ImageEnhance.Contrast(img)
- img = enhancer.enhance(1.3)
-
- # Save to bytes with compression
- buffer = io.BytesIO()
- img.save(buffer, format="JPEG", quality=92, optimize=True) # Higher quality for better OCR
- buffer.seek(0)
-
- # Get the base64
- encoded_image = base64.b64encode(buffer.getvalue()).decode()
- base64_data_url = f"data:image/jpeg;base64,{encoded_image}"
-
- # Log the new size
- new_size_mb = len(buffer.getvalue()) / (1024 * 1024)
- logger.info(f"Resized image to {new_size_mb:.2f} MB")
- except ImportError:
- logger.warning("PIL not available for resizing. Using original image.")
- # Use enhanced encoder with proper MIME type detection
- from utils.image_utils import encode_image_for_api
- base64_data_url = encode_image_for_api(file_path)
- except Exception as e:
- logger.warning(f"Image resize failed: {str(e)}. Using original image.")
- # Use enhanced encoder with proper MIME type detection
- from utils.image_utils import encode_image_for_api
- base64_data_url = encode_image_for_api(file_path)
- else:
- # For smaller images, use as-is with proper MIME type
- from utils.image_utils import encode_image_for_api
- base64_data_url = encode_image_for_api(file_path)
- except Exception as e:
- # Fallback to original image if any preprocessing fails
- logger.warning(f"Image preprocessing failed: {str(e)}. Using original image.")
- # Use enhanced encoder with proper MIME type detection
- from utils.image_utils import encode_image_for_api
- base64_data_url = encode_image_for_api(file_path)
+ # Read and encode the image file
+ logger.info("Encoding image for API")
+ encoded_image = base64.b64encode(file_path.read_bytes()).decode()
+ base64_data_url = f"data:image/jpeg;base64,{encoded_image}"
# Process the image with OCR
logger.info(f"Processing image with OCR using {OCR_MODEL}")
-
- # Add retry logic with more retries and longer backoff periods for rate limit issues
- max_retries = 2 # Reduced to prevent rate limiting
- retry_delay = 1 # Shorter delay between retries
-
- for retry in range(max_retries):
- try:
- image_response = self.client.ocr.process(
- document=ImageURLChunk(image_url=base64_data_url),
- model=OCR_MODEL,
- include_image_base64=True,
- timeout_ms=45000 # 45 second timeout for better performance
- )
- break # Success, exit retry loop
- except Exception as e:
- error_msg = str(e)
- logger.warning(f"API error on attempt {retry+1}/{max_retries}: {error_msg}")
-
- # Check specific error types to handle them appropriately
- error_lower = error_msg.lower()
-
- # Authentication errors - no point in retrying
- if "unauthorized" in error_lower or "401" in error_lower:
- logger.error("API authentication failed. Check your API key.")
- raise ValueError(f"Authentication failed with API key. Please verify your Mistral API key is correct and active: {error_msg}")
-
- # Connection errors - worth retrying
- elif "connection" in error_lower or "timeout" in error_lower or "520" in error_msg or "server error" in error_lower:
- if retry < max_retries - 1:
- # Wait with shorter delay before retrying
- wait_time = retry_delay * (2 ** retry)
- logger.info(f"Connection issue detected. Waiting {wait_time}s before retry...")
- time.sleep(wait_time)
- else:
- # Last retry failed
- logger.error("Maximum retries reached, API connection error persists.")
- raise ValueError(f"Could not connect to Mistral API after {max_retries} attempts: {error_msg}")
-
- # Rate limit errors
- elif "rate limit" in error_lower or "429" in error_lower or "requests rate limit exceeded" in error_lower:
- # Check specifically for token exhaustion vs temporary rate limit
- if "quota" in error_lower or "credit" in error_lower or "subscription" in error_lower:
- logger.error("API quota or credit limit reached. No retry will help.")
- raise ValueError(f"Mistral API quota or credit limit reached. Please check your subscription: {error_msg}")
- elif retry < max_retries - 1:
- # More aggressive backoff for rate limits
- wait_time = retry_delay * (2 ** retry) * 5 # 5x longer wait for rate limits
- logger.info(f"Rate limit exceeded. Waiting {wait_time}s before retry...")
- time.sleep(wait_time)
- else:
- # Last retry failed, try local OCR as fallback
- logger.error("Maximum retries reached, rate limit error persists.")
- try:
- # Try to import the local OCR fallback function
- from utils.image_utils import try_local_ocr_fallback
-
- # Attempt local OCR fallback
- ocr_text = try_local_ocr_fallback(file_path, base64_data_url)
-
- if ocr_text:
- logger.info("Successfully used local OCR fallback")
- # Return a basic result with the local OCR text
- return {
- "file_name": file_path.name,
- "topics": ["Document"],
- "languages": ["English"],
- "ocr_contents": {
- "title": "Document (Local OCR)",
- "content": "This document was processed with local OCR due to API rate limiting.",
- "raw_text": ocr_text
- },
- "processing_method": "local_fallback",
- "processing_note": "Used local OCR due to API rate limit"
- }
- except (ImportError, Exception) as local_err:
- logger.warning(f"Local OCR fallback failed: {str(local_err)}")
-
- # If we get here, both API and local OCR failed
- raise ValueError(f"Mistral API rate limit exceeded. Please try again later: {error_msg}")
-
- # Other errors - no retry
- else:
- logger.error(f"Unrecoverable API error: {error_msg}")
- raise
+ image_response = self.client.ocr.process(
+ document=ImageURLChunk(image_url=base64_data_url),
+ model=OCR_MODEL,
+ include_image_base64=True
+ )
# Get the OCR markdown from the first page
image_ocr_markdown = image_response.pages[0].markdown if image_response.pages else ""
- # Check if the OCR response has images
- has_images = hasattr(image_response, 'pages') and image_response.pages and hasattr(image_response.pages[0], 'images') and image_response.pages[0].images
-
- # Check for language information directly from the OCR model
- detected_languages = set()
-
- # Check if the response has a 'languages' attribute in any form
- # First check direct attributes on the response object
- if hasattr(image_response, 'languages') and image_response.languages:
- for lang in image_response.languages:
- detected_languages.add(str(lang))
- logger.info(f"Found language in OCR response: {lang}")
-
- # Then check if it's in the response as a dictionary format
- elif hasattr(image_response, '__dict__'):
- response_dict = image_response.__dict__
- if 'languages' in response_dict and response_dict['languages']:
- for lang in response_dict['languages']:
- detected_languages.add(str(lang))
- logger.info(f"Found language in OCR response dict: {lang}")
-
- # Check for languages in individual pages
- if hasattr(image_response, 'pages') and image_response.pages:
- for page in image_response.pages:
- if hasattr(page, 'languages') and page.languages:
- for lang in page.languages:
- detected_languages.add(str(lang))
- logger.info(f"Found language in page: {lang}")
-
- # Optimize: Skip vision model step if ocr_markdown is very small or empty
- # BUT make an exception if custom_prompt is provided
- # OR if the image has visual content worth preserving
- if (not custom_prompt and not has_images) and (not image_ocr_markdown or len(image_ocr_markdown) < 50):
- logger.warning("OCR produced minimal text with no images. Returning basic result.")
- return {
- "file_name": file_path.name,
- "topics": ["Document"],
- "languages": ["English"],
- "ocr_contents": {
- "raw_text": image_ocr_markdown if image_ocr_markdown else "No text could be extracted from the image."
- },
- "processing_note": "OCR produced minimal text content",
- # Include raw response data for images
- "raw_response_data": serialize_ocr_response(image_response)
- }
-
- # For images with minimal text but visual content, enhance the prompt
- elif has_images and (not image_ocr_markdown or len(image_ocr_markdown) < 100):
- logger.info("Document with images but minimal text detected. Using enhanced prompt for mixed media.")
- if not custom_prompt:
- custom_prompt = "This is a mixed media document with both text and important visual elements. Please carefully describe the image content and extract all visible text, preserving the relationship between text and visuals."
- elif "visual" not in custom_prompt.lower() and "image" not in custom_prompt.lower():
- custom_prompt += " The document contains important visual elements that should be described along with the text content."
+ # Calculate confidence score if available
+ confidence_score = 0.85 # Default value
+ try:
+ if hasattr(image_response.pages[0], 'confidence'):
+ confidence_score = image_response.pages[0].confidence
+ except:
+ pass
- # Extract structured data using the appropriate model, with a single API call
+ # Extract structured data using the appropriate model
if use_vision:
logger.info(f"Using vision model: {VISION_MODEL}")
- result = self._extract_structured_data_with_vision(base64_data_url, image_ocr_markdown, file_path.name, custom_prompt)
+ result = self._extract_structured_data_with_vision(base64_data_url, image_ocr_markdown, file_path.name)
else:
logger.info(f"Using text-only model: {TEXT_MODEL}")
- result = self._extract_structured_data_text_only(image_ocr_markdown, file_path.name, custom_prompt)
-
- # If we have detected languages directly from the OCR model, use them
- if detected_languages:
- logger.info(f"Using languages detected by OCR model: {', '.join(detected_languages)}")
- result['languages'] = list(detected_languages)
- # Add flag to indicate source of language detection
- result['language_detection_source'] = 'mistral-ocr-latest'
-
- # Store the serialized OCR response for image rendering (for compatibility with original version)
- # Don't store raw_response directly as it's not JSON serializable
- serialized_response = serialize_ocr_response(image_response)
- result['raw_response_data'] = serialized_response
-
- # Store key parts of the OCR response for image rendering
- # With serialized format that can be stored in JSON
- result['has_images'] = has_images
-
- if has_images:
- # Serialize the entire response to ensure it's JSON serializable
- serialized_response = serialize_ocr_response(image_response)
+ result = self._extract_structured_data_text_only(image_ocr_markdown, file_path.name)
- # Create a structured representation of images that can be serialized
- result['pages_data'] = []
+ # Add confidence score
+ result['confidence_score'] = confidence_score
- if hasattr(serialized_response, 'pages'):
- serialized_pages = serialized_response.pages
- else:
- # Handle case where serialization returns a dict instead of an object
- serialized_pages = serialized_response.get('pages', [])
-
- for page_idx, page in enumerate(serialized_pages):
- # Handle both object and dict forms
- if isinstance(page, dict):
- markdown = page.get('markdown', '')
- images = page.get('images', [])
- else:
- markdown = page.markdown if hasattr(page, 'markdown') else ''
- images = page.images if hasattr(page, 'images') else []
-
- page_data = {
- 'page_number': page_idx + 1,
- 'markdown': markdown,
- 'images': []
- }
-
- # Extract images if present
- for img_idx, img in enumerate(images):
- img_id = None
- img_base64 = None
-
- if isinstance(img, dict):
- img_id = img.get('id')
- img_base64 = img.get('image_base64')
- else:
- img_id = img.id if hasattr(img, 'id') else None
- img_base64 = img.image_base64 if hasattr(img, 'image_base64') else None
-
- if img_base64:
- page_data['images'].append({
- 'id': img_id if img_id else f"img_{page_idx}_{img_idx}",
- 'image_base64': img_base64
- })
-
- result['pages_data'].append(page_data)
+ # Store the raw OCR response for image rendering
+ result['raw_response'] = image_response
logger.info("Image processing completed successfully")
return result
@@ -1462,6 +313,7 @@ class StructuredOCR:
"file_name": file_path.name,
"topics": ["Document"],
"languages": ["English"],
+ "confidence_score": 0.0,
"error": str(e),
"ocr_contents": {
"error": f"Failed to process image: {str(e)}",
@@ -1469,198 +321,29 @@ class StructuredOCR:
}
}
- def _extract_structured_data_with_vision(self, image_base64, ocr_markdown, filename, custom_prompt=None):
- """
- Extract structured data using vision model with detailed historical context prompting
- Optimized for speed, accuracy, and resilience
- """
- logger = logging.getLogger("vision_processor")
-
+ def _extract_structured_data_with_vision(self, image_base64, ocr_markdown, filename):
+ """Extract structured data using vision model"""
try:
- # Check if this is a newspaper or document with columns by filename
- is_likely_newspaper = False
- newspaper_keywords = ["newspaper", "gazette", "herald", "times", "journal",
- "chronicle", "post", "tribune", "news", "press", "gender"]
-
- # Check filename for newspaper indicators
- filename_lower = filename.lower()
- for keyword in newspaper_keywords:
- if keyword in filename_lower:
- is_likely_newspaper = True
- logger.info(f"Likely newspaper document detected in vision processing: {filename}")
- break
-
- # Fast path: Skip vision API if OCR already produced reasonable text
- # We'll define "reasonable" as having at least 300 characters
- if len(ocr_markdown.strip()) > 300:
- logger.info("Sufficient OCR text detected, analyzing language before using OCR text directly")
-
- # Perform language detection on the OCR text before returning
- if LANG_DETECTOR_AVAILABLE and self.language_detector:
- detected_languages = self.language_detector.detect_languages(
- ocr_markdown,
- filename=getattr(self, 'current_filename', None)
- )
- else:
- # If language detector is not available, use default English
- detected_languages = ["English"]
-
- return {
- "file_name": filename,
- "topics": ["Document"],
- "languages": detected_languages,
- "ocr_contents": {
- "raw_text": ocr_markdown
- }
- }
-
- # Only use vision model for minimal OCR text or when document has columns
- if is_likely_newspaper and (not ocr_markdown or len(ocr_markdown.strip()) < 300):
- logger.info("Using vision model for newspaper with minimal OCR text")
- if not custom_prompt:
- custom_prompt = "Document has columns. Extract text by reading each column top to bottom."
-
- # Fast path: Skip if in test mode or no API key
- if self.test_mode or not self.api_key:
- logger.info("Test mode or no API key, using text-only processing")
- return self._extract_structured_data_text_only(ocr_markdown, filename)
-
- # Use only the first part of OCR text to keep prompts small and processing fast
- if len(ocr_markdown) > 1000:
- truncated_ocr = ocr_markdown[:1000]
- logger.info(f"Truncated OCR text from {len(ocr_markdown)} to 1000 chars for faster processing")
- else:
- truncated_ocr = ocr_markdown
-
- # Build a comprehensive prompt with OCR text and detailed instructions for title detection and language handling
- enhanced_prompt = f"This is a document's OCR text:\n\n{truncated_ocr}\n\n\n"
-
- # Add custom prompt if provided
- if custom_prompt:
- enhanced_prompt += f"User instructions: {custom_prompt}\n\n"
-
- # Primary focus on document structure and title detection
- enhanced_prompt += "You are analyzing a historical document. Follow these extraction priorities:\n"
- enhanced_prompt += "1. FIRST PRIORITY: Identify and extract the TITLE of the document. Look for large text at the top, decorative typography, or centered text that appears to be a title. The title is often one of the first elements in historical documents.\n"
- enhanced_prompt += "2. SECOND: Extract all text content accurately from this document, including any text visible in the image that may not have been captured by OCR.\n\n"
- enhanced_prompt += "Document Title Guidelines:\n"
- enhanced_prompt += "- For printed historical works: Look for primary heading at top of the document, all-caps text, or larger font size text\n"
- enhanced_prompt += "- For newspapers/periodicals: Extract both newspaper name and article title if present\n"
- enhanced_prompt += "- For handwritten documents: Look for centered text at the top or underlined headings\n"
- enhanced_prompt += "- For engravings/illustrations: Include the title or caption, which often appears below the image\n\n"
-
- # Language detection guidance
- enhanced_prompt += "IMPORTANT: After extracting the title and text content, determine the languages present.\n"
- enhanced_prompt += "Precisely identify and list ALL languages present in the document separately. Look closely for multiple languages that might appear together.\n"
- enhanced_prompt += "For language detection, examine these specific indicators:\n"
- enhanced_prompt += "- French: accents (é, è, ê, à, ç, â, î, ô, û), words like 'le', 'la', 'les', 'et', 'en', 'de', 'du', 'des', 'dans', 'ce', 'cette', 'ces', 'par', 'pour', 'qui', 'que', 'où', 'avec'\n"
- enhanced_prompt += "- Portuguese: accents (ã, õ, á, é, ê, ó, ç), words like 'e', 'o', 'de', 'da', 'do', 'em', 'para', 'que', 'não', 'com'\n"
- enhanced_prompt += "- Spanish: ñ, inverted punctuation (¿, ¡), accents (á, é, í, ó, ú), words like 'el', 'la', 'los', 'las', 'y', 'en', 'por', 'que', 'con'\n"
- enhanced_prompt += "- German: umlauts (ä, ö, ü), sharp s (ß), words like 'und', 'der', 'die', 'das', 'in', 'mit'\n"
- enhanced_prompt += "- Italian: accents (à, è, é, ì, ò, ù), words like 'il', 'la', 'e', 'di', 'che', 'per', 'con'\n"
- enhanced_prompt += "- Chinese: hanzi characters (汉字), lack of spaces between words, markers like 的, 是, 了, 在, 和, 有\n"
- enhanced_prompt += "- Latin: words like 'et', 'in', 'ad', 'est', 'sunt', 'non', 'cum', 'sed'\n\n"
- enhanced_prompt += "If the document contains multiple columns or sections, process each section independently and then combine them logically.\n"
- enhanced_prompt += "Return ALL detected languages as separate entries in the languages array, never combine them.\n"
- enhanced_prompt += "CRITICAL: Do NOT default to English unless absolutely certain. If you see French characteristics like 'é', 'è', 'ê', 'ç' or French words, prioritize French in your language detection."
-
- # Measure API call time for optimization feedback
- start_time = time.time()
-
- try:
- # Use a fixed, shorter timeout for single-page documents
- timeout_ms = 45000 # 45 seconds is optimal for most single-page documents
-
- logger.info(f"Calling vision model with {timeout_ms}ms timeout")
- chat_response = self.client.chat.parse(
- model=VISION_MODEL,
- messages=[
- {
- "role": "user",
- "content": [
- ImageURLChunk(image_url=image_base64),
- TextChunk(text=enhanced_prompt)
- ],
- },
- ],
- response_format=StructuredOCRModel,
- temperature=0,
- timeout_ms=timeout_ms
- )
-
- api_time = time.time() - start_time
- logger.info(f"Vision model completed in {api_time:.2f}s")
-
- except Exception as e:
- # If there's an error with the enhanced prompt, try progressively simpler approaches
- logger.warning(f"Enhanced prompt failed after {time.time() - start_time:.2f}s: {str(e)}")
-
- # Try a very simplified approach with minimal context
- try:
- # Ultra-short prompt for faster processing
- simplified_prompt = (
- f"Extract text from this document image. "
- f"\n{truncated_ocr[:500]}\n\n"
- f"Return a JSON with file_name, topics, languages, and ocr_contents fields."
- )
-
- # Only add minimal custom prompt if provided
- if custom_prompt and len(custom_prompt) < 100:
- simplified_prompt += f"\n{custom_prompt}"
-
- logger.info(f"Trying simplified prompt approach")
- chat_response = self.client.chat.parse(
- model=VISION_MODEL,
- messages=[
- {
- "role": "user",
- "content": [
- ImageURLChunk(image_url=image_base64),
- TextChunk(text=simplified_prompt)
- ],
- },
+ # Parse with vision model with a timeout
+ chat_response = self.client.chat.parse(
+ model=VISION_MODEL,
+ messages=[
+ {
+ "role": "user",
+ "content": [
+ ImageURLChunk(image_url=image_base64),
+ TextChunk(text=(
+ f"This is a historical document's OCR in markdown:\n"
+ f"\n{ocr_markdown}\n.\n"
+ f"Convert this into a structured JSON response with the OCR contents in a sensible dictionary. "
+ f"Extract topics, languages, and organize the content logically."
+ ))
],
- response_format=StructuredOCRModel,
- temperature=0,
- timeout_ms=30000 # Very short timeout for simplified approach (30 seconds)
- )
-
- logger.info(f"Simplified prompt approach succeeded")
-
- except Exception as second_e:
- # If that fails, try with minimal prompt and just image analysis
- logger.warning(f"Simplified prompt failed: {str(second_e)}. Trying minimal prompt.")
-
- try:
- # Minimal prompt focusing only on OCR task
- minimal_prompt = (
- f"Extract the text from this image. "
- f"Return JSON with file_name, topics, languages, and ocr_contents.raw_text fields."
- )
-
- logger.info(f"Trying minimal prompt with image-only focus")
- chat_response = self.client.chat.parse(
- model=VISION_MODEL,
- messages=[
- {
- "role": "user",
- "content": [
- ImageURLChunk(image_url=image_base64),
- TextChunk(text=minimal_prompt)
- ],
- },
- ],
- response_format=StructuredOCRModel,
- temperature=0,
- timeout_ms=25000 # Minimal timeout for last attempt (25 seconds)
- )
-
- logger.info(f"Minimal prompt approach succeeded")
-
- except Exception as third_e:
- # If all vision attempts fail, fall back to text-only model
- logger.warning(f"All vision model attempts failed, falling back to text-only model: {str(third_e)}")
- return self._extract_structured_data_text_only(ocr_markdown, filename)
+ },
+ ],
+ response_format=StructuredOCRModel,
+ temperature=0
+ )
# Convert the response to a dictionary
result = json.loads(chat_response.choices[0].message.parsed.json())
@@ -1668,314 +351,51 @@ class StructuredOCR:
# Ensure languages is a list of strings, not Language enum objects
if 'languages' in result:
result['languages'] = [str(lang) for lang in result.get('languages', [])]
-
- # Add simplified metadata about processing
- result['processing_info'] = {
- 'method': 'vision_model',
- 'ocr_text_length': len(ocr_markdown),
- 'api_response_time': time.time() - start_time
- }
-
- # Note if custom prompt was applied
- if custom_prompt:
- result['custom_prompt_applied'] = 'vision_model'
- # Add confidence score if not present
- if 'confidence_score' not in result:
- result['confidence_score'] = 0.92 # Vision model typically has higher confidence
-
- # If OCR text has clear French patterns but language is English or missing, fix it
- if ocr_markdown and 'languages' in result:
- if LANG_DETECTOR_AVAILABLE and self.language_detector:
- result['languages'] = self.language_detector.detect_languages(
- ocr_markdown,
- filename=getattr(self, 'current_filename', None),
- current_languages=result['languages']
- )
-
except Exception as e:
# Fall back to text-only model if vision model fails
- logger.warning(f"Vision model processing failed, falling back to text-only model: {str(e)}")
+ print(f"Vision model failed: {str(e)}. Falling back to text-only model.")
result = self._extract_structured_data_text_only(ocr_markdown, filename)
return result
-
- # We've removed document type detection entirely for simplicity
-
-
- # Create a prompt with enhanced language detection instructions
- generic_section = (
- f"You are an OCR specialist processing historical documents. "
- f"Focus on accurately extracting text content and image chunks while preserving structure and formatting. "
- f"Pay attention to any historical features and document characteristics.\n\n"
- f"Create a structured JSON response with the following fields:\n"
- f"- file_name: The document's name\n"
- f"- topics: An array of topics covered in the document\n"
- f"- languages: An array of languages used in the document (be precise and specific about language detection)\n"
- f"- ocr_contents: A comprehensive dictionary with the document's contents including:\n"
- f" * title: The title or heading (if present)\n"
- f" * transcript: The full text of the document\n"
- f" * text: The main text content (if different from transcript)\n"
- f" * content: The body content (if different than transcript)\n"
- f" * images: An array of image objects with their base64 data\n"
- f" * alt_text: The alt text or description of the images\n"
- f" * caption: The caption or title of the images\n"
- f" * raw_text: The complete OCR text\n"
- )
-
- # Add custom prompt if provided
- custom_section = ""
- if custom_prompt:
- custom_section = f"\n\nUser-provided instructions: {custom_prompt}\n"
-
- # Return the enhanced prompt
- return generic_section + custom_section
-
- def _extract_structured_data_text_only(self, ocr_markdown, filename, custom_prompt=None):
- """
- Extract structured data using text-only model with detailed historical context prompting
- and improved error handling with enhanced language detection
- """
- logger = logging.getLogger("text_processor")
- start_time = time.time()
-
+
+ def _extract_structured_data_text_only(self, ocr_markdown, filename):
+ """Extract structured data using text-only model"""
try:
- # Fast path: Skip for minimal OCR text
- if not ocr_markdown or len(ocr_markdown.strip()) < 50:
- logger.info("Minimal OCR text - returning basic result")
-
- # Attempt comprehensive language detection even for minimal text
- detected_languages = []
-
- # Simple language detection based on character frequency
- if ocr_markdown and len(ocr_markdown.strip()) > 10:
- # Define indicators for all supported languages
- language_indicators = {
- "Portuguese": {
- "chars": ['ã', 'õ', 'á', 'é', 'ê', 'í', 'ó', 'ú', 'ç'],
- "words": ['e', 'o', 'de', 'da', 'do', 'em', 'para', 'que', 'não', 'com']
- },
- "Spanish": {
- "chars": ['ñ', 'á', 'é', 'í', 'ó', 'ú', '¿', '¡'],
- "words": ['el', 'la', 'los', 'las', 'y', 'en', 'por', 'que', 'con', 'del']
- },
- "French": {
- "chars": ['é', 'è', 'ê', 'à', 'ç', 'ù', 'â', 'î', 'ô', 'û'],
- "words": ['le', 'la', 'les', 'et', 'en', 'de', 'du', 'des', 'un', 'une', 'ce', 'cette', 'qui', 'que', 'pour', 'dans', 'par', 'sur']
- },
- "German": {
- "chars": ['ä', 'ö', 'ü', 'ß'],
- "words": ['der', 'die', 'das', 'und', 'ist', 'von', 'mit', 'für', 'sich']
- },
- "Italian": {
- "chars": ['à', 'è', 'é', 'ì', 'ò', 'ù'],
- "words": ['il', 'la', 'e', 'di', 'che', 'per', 'con', 'sono', 'non']
- },
- "Latin": {
- "chars": [],
- "words": ['et', 'in', 'ad', 'est', 'sunt', 'non', 'cum', 'sed', 'qui', 'quod']
- }
- }
-
- words = ocr_markdown.lower().split()
-
- # Check for indicators of each language
- for language, indicators in language_indicators.items():
- chars = indicators["chars"]
- lang_words = indicators["words"]
-
- has_chars = any(char in ocr_markdown for char in chars) if chars else False
- word_count = sum(1 for word in words if word in lang_words)
-
- # Add language if strong enough indicators are present
- if has_chars or word_count >= 2:
- detected_languages.append(language)
-
- # Check for English separately
- english_words = ['the', 'and', 'of', 'to', 'in', 'a', 'is', 'that', 'for', 'it']
- english_count = sum(1 for word in words if word in english_words)
- if english_count >= 2:
- detected_languages.append("English")
-
- # If no languages detected, default to English
- if not detected_languages:
- detected_languages = ["English"]
-
- return {
- "file_name": filename,
- "topics": ["Document"],
- "languages": detected_languages,
- "ocr_contents": {
- "raw_text": ocr_markdown if ocr_markdown else "No text could be extracted"
- },
- "processing_method": "minimal_text"
- }
-
- # Check for API key to avoid unnecessary processing
- if self.test_mode or not self.api_key:
- logger.info("Test mode or no API key - returning basic result")
- return {
- "file_name": filename,
- "topics": ["Document"],
- "languages": ["English"],
- "ocr_contents": {
- "raw_text": ocr_markdown[:10000] if ocr_markdown else "No text could be extracted",
- "note": "API key not provided - showing raw OCR text only"
+ # Parse with text-only model with a timeout
+ chat_response = self.client.chat.parse(
+ model=TEXT_MODEL,
+ messages=[
+ {
+ "role": "user",
+ "content": f"This is a historical document's OCR in markdown:\n"
+ f"\n{ocr_markdown}\n.\n"
+ f"Convert this into a structured JSON response with the OCR contents. "
+ f"Extract topics, languages, and organize the content logically."
},
- "processing_method": "test_mode"
- }
-
- # If OCR text is very large, truncate it to avoid API limits
- truncated_text = ocr_markdown
- if len(ocr_markdown) > 25000:
- # Keep first 15000 chars and last 5000 chars
- truncated_text = ocr_markdown[:15000] + "\n...[content truncated]...\n" + ocr_markdown[-5000:]
- logger.info(f"OCR text truncated from {len(ocr_markdown)} to {len(truncated_text)} chars")
-
- # Build a prompt with enhanced title detection and language detection instructions
- enhanced_prompt = f"This is a document's OCR text:\n\n{truncated_text}\n\n\n"
+ ],
+ response_format=StructuredOCRModel,
+ temperature=0
+ )
- # Add custom prompt if provided
- if custom_prompt:
- enhanced_prompt += f"User instructions: {custom_prompt}\n\n"
-
- # Add title detection focus
- enhanced_prompt += "You are analyzing a historical document. Please follow these extraction priorities:\n"
- enhanced_prompt += "1. FIRST PRIORITY: Identify and extract the TITLE of the document. Look for prominent text at the top, decorative typography, or centered text that appears to be a title.\n"
- enhanced_prompt += " - For historical documents with prominent headings at the top\n"
- enhanced_prompt += " - For newspapers or periodicals, extract both the publication name and article title\n"
- enhanced_prompt += " - For manuscripts or letters, identify any heading or subject line\n"
- enhanced_prompt += "2. SECOND PRIORITY: Extract all text content accurately and return structured data with the document's contents.\n\n"
- enhanced_prompt += "IMPORTANT: Precisely identify and list ALL languages present in the document separately. Look closely for multiple languages that might appear together.\n"
- enhanced_prompt += "For language detection, examine these specific indicators:\n"
- enhanced_prompt += "- French: accents (é, è, ê, à, ç), words like 'le', 'la', 'les', 'et', 'en', 'de', 'du'\n"
- enhanced_prompt += "- German: umlauts (ä, ö, ü), sharp s (ß), words like 'und', 'der', 'die', 'das', 'in', 'mit'\n"
- enhanced_prompt += "- Spanish: ñ, inverted punctuation (¿, ¡), accents (á, é, í, ó, ú), words like 'el', 'la', 'los', 'las', 'y', 'en'\n"
- enhanced_prompt += "- Italian: words like 'il', 'la', 'e', 'di', 'che', 'per', 'con'\n"
- enhanced_prompt += "- Chinese: hanzi characters (汉字), lack of spaces between words, markers like 的, 是, 了, 在, 和, 有\n"
- enhanced_prompt += "- Latin: words like 'et', 'in', 'ad', 'est', 'sunt', 'non', 'cum', 'sed'\n"
- enhanced_prompt += "Do NOT classify text as English unless you can positively confirm it contains specifically English words and phrases.\n\n"
- enhanced_prompt += "Return ALL detected languages as separate entries in the languages array. If multiple languages are present, list them ALL separately."
-
- # Use enhanced prompt with text-only model - with retry logic
- max_retries = 2
- retry_delay = 1
-
- for retry in range(max_retries):
- try:
- logger.info(f"Calling text model ({TEXT_MODEL})")
- api_start = time.time()
-
- # Set appropriate timeout based on text length
- timeout_ms = min(120000, max(30000, len(truncated_text) * 5)) # 30-120s based on length
-
- # Make API call with appropriate timeout
- chat_response = self.client.chat.parse(
- model=TEXT_MODEL,
- messages=[
- {
- "role": "user",
- "content": enhanced_prompt
- },
- ],
- response_format=StructuredOCRModel,
- temperature=0,
- timeout_ms=timeout_ms
- )
-
- api_time = time.time() - api_start
- logger.info(f"Text model API call completed in {api_time:.2f}s")
-
- # Convert the response to a dictionary
- result = json.loads(chat_response.choices[0].message.parsed.json())
-
- # Ensure languages is a list of strings, not Language enum objects
- if 'languages' in result:
- result['languages'] = [str(lang) for lang in result.get('languages', [])]
-
- # Add simplified processing metadata
- result['processing_method'] = 'text_model'
- result['model_used'] = TEXT_MODEL
- result['processing_time'] = time.time() - start_time
-
- # Flag when custom prompt has been successfully applied
- if custom_prompt:
- result['custom_prompt_applied'] = 'text_model'
-
- # Add raw text for reference if not already present
- if 'ocr_contents' in result and 'raw_text' not in result['ocr_contents']:
- # Add truncated raw text if very large
- if len(ocr_markdown) > 50000:
- result['ocr_contents']['raw_text'] = ocr_markdown[:50000] + "\n...[content truncated]..."
- else:
- result['ocr_contents']['raw_text'] = ocr_markdown
-
- return result
-
- except Exception as api_error:
- error_msg = str(api_error).lower()
- logger.warning(f"API error on attempt {retry+1}/{max_retries}: {str(api_error)}")
-
- # Check if retry would help
- if retry < max_retries - 1:
- # Rate limit errors - special handling with longer wait
- if any(term in error_msg for term in ["rate limit", "429", "too many requests", "requests rate limit exceeded"]):
- # Check specifically for token exhaustion vs temporary rate limit
- if any(term in error_msg for term in ["quota", "credit", "subscription"]):
- logger.error("API quota or credit limit reached. No retry will help.")
- raise ValueError(f"Mistral API quota or credit limit reached. Please check your subscription: {error_msg}")
- # Longer backoff for rate limit errors
- wait_time = retry_delay * (2 ** retry) * 6.0 # 6x longer wait for rate limits
- logger.info(f"Rate limit exceeded. Waiting {wait_time:.1f}s before retry...")
- time.sleep(wait_time)
- # Other transient errors
- elif any(term in error_msg for term in ["timeout", "connection", "500", "503", "504"]):
- # Wait before retrying
- wait_time = retry_delay * (2 ** retry)
- logger.info(f"Transient error, retrying in {wait_time}s")
- time.sleep(wait_time)
- else:
- # Non-retryable error
- raise
- else:
- # Last retry failed
- raise
+ # Convert the response to a dictionary
+ result = json.loads(chat_response.choices[0].message.parsed.json())
- # This shouldn't be reached due to raise in the loop, but just in case
- raise Exception("All retries failed for text model")
+ # Ensure languages is a list of strings, not Language enum objects
+ if 'languages' in result:
+ result['languages'] = [str(lang) for lang in result.get('languages', [])]
except Exception as e:
- logger.error(f"Text model failed: {str(e)}. Creating basic result.")
-
- # Create a basic result with available OCR text
- try:
- # Create a more informative fallback result
- result = {
- "file_name": filename,
- "topics": ["Document"],
- "languages": ["English"],
- "ocr_contents": {
- "raw_text": ocr_markdown[:50000] if ocr_markdown else "No text could be extracted",
- "error": "AI processing failed: " + str(e).replace('"', '\\"')
- },
- "processing_method": "fallback",
- "processing_error": str(e),
- "processing_time": time.time() - start_time
- }
-
- # No topic detection to avoid issue with document misclassification
-
- except Exception as inner_e:
- logger.error(f"Error creating basic result: {str(inner_e)}")
- result = {
- "file_name": str(filename) if filename else "unknown",
- "topics": ["Document"],
- "languages": ["English"],
- "ocr_contents": {
- "error": "Processing failed completely",
- "partial_text": ocr_markdown[:1000] if ocr_markdown else "Document could not be processed."
- }
+ # Create a basic result if parsing fails
+ print(f"Text model failed: {str(e)}. Creating basic result.")
+ result = {
+ "file_name": filename,
+ "topics": ["Document"],
+ "languages": ["English"],
+ "ocr_contents": {
+ "raw_text": ocr_markdown
}
+ }
return result
@@ -1991,4 +411,4 @@ if __name__ == "__main__":
processor = StructuredOCR()
result = processor.process_file(file_path)
- print(json.dumps(result, indent=2))
+ print(json.dumps(result, indent=2))
\ No newline at end of file
diff --git a/test_pdf.py b/test_pdf.py
new file mode 100644
index 0000000000000000000000000000000000000000..370e4fb962d73a0644048029a300acad36ff2670
--- /dev/null
+++ b/test_pdf.py
@@ -0,0 +1,42 @@
+#!/usr/bin/env python3
+"""
+Test script for pdf_ocr.py
+"""
+
+from pdf_ocr import PDFOCR
+import json
+import os
+
+def main():
+ # Initialize PDF processor
+ processor = PDFOCR()
+
+ # Define input and output paths
+ pdf_path = "input/rubric.pdf"
+ output_path = "output/rubric_test.json"
+
+ # Create output directory if it doesn't exist
+ os.makedirs(os.path.dirname(output_path), exist_ok=True)
+
+ # Process PDF and save output
+ print(f"Processing PDF: {pdf_path}")
+ processor.save_json_output(pdf_path, output_path)
+ print(f"Output saved to: {output_path}")
+
+ # Read and print the output
+ with open(output_path, 'r') as f:
+ result = json.load(f)
+
+ print("\nOutput preview:")
+ print(f"File name: {result.get('file_name')}")
+ print(f"Topics: {result.get('topics')}")
+ print(f"Languages: {result.get('languages')}")
+ print("OCR contents preview (first few keys):")
+ ocr_contents = result.get('ocr_contents', {})
+ for i, (key, value) in enumerate(ocr_contents.items()):
+ if i >= 3: # Only show first 3 keys
+ break
+ print(f" {key}: {value[:100]}..." if isinstance(value, str) and len(value) > 100 else f" {key}: {value}")
+
+if __name__ == "__main__":
+ main()
\ No newline at end of file
diff --git a/test_pdf_preview.py b/test_pdf_preview.py
new file mode 100644
index 0000000000000000000000000000000000000000..991f29993f32fc9aa5b9eac28bcd5abcfb23bdd6
--- /dev/null
+++ b/test_pdf_preview.py
@@ -0,0 +1,51 @@
+#!/usr/bin/env python3
+"""
+Test PDF preview functionality
+"""
+import os
+import io
+from pathlib import Path
+from pdf2image import convert_from_path
+from PIL import Image
+
+def test_pdf_preview():
+ """Test converting a PDF to an image preview"""
+ # Get the first PDF file from the input directory
+ input_dir = Path(__file__).parent / "input"
+ pdf_files = list(input_dir.glob("*.pdf"))
+
+ if not pdf_files:
+ print("No PDF files found in the input directory")
+ return
+
+ pdf_path = pdf_files[0]
+ print(f"Testing PDF preview with file: {pdf_path}")
+
+ try:
+ # Convert first page of PDF to image
+ images = convert_from_path(pdf_path, first_page=1, last_page=1)
+
+ if not images:
+ print("No images extracted from PDF")
+ return
+
+ # Save the preview image
+ first_page = images[0]
+ output_dir = Path(__file__).parent / "output"
+ output_dir.mkdir(exist_ok=True)
+ output_path = output_dir / f"{pdf_path.stem}_preview.jpg"
+
+ first_page.save(output_path, format='JPEG')
+ print(f"PDF preview saved to: {output_path}")
+
+ # Demonstrate converting to bytes for Streamlit
+ img_bytes = io.BytesIO()
+ first_page.save(img_bytes, format='JPEG')
+ img_bytes.seek(0)
+ print(f"Successfully converted PDF to image bytes (size: {len(img_bytes.getvalue())} bytes)")
+
+ except Exception as e:
+ print(f"Error converting PDF to image: {str(e)}")
+
+if __name__ == "__main__":
+ test_pdf_preview()
\ No newline at end of file
diff --git a/ui/__init__.py b/ui/__init__.py
deleted file mode 100644
index 00d0c72780f0d3a3992493a13655d54dfd526bcc..0000000000000000000000000000000000000000
--- a/ui/__init__.py
+++ /dev/null
@@ -1 +0,0 @@
-"""UI package for Historical OCR."""
diff --git a/ui/__pycache__/layout.cpython-312.pyc b/ui/__pycache__/layout.cpython-312.pyc
index 429b8fdb06e42a047a988d80b79d05a242b39ee2..8195077b0f7c910a242f5a01ddae54a86d52c37e 100644
Binary files a/ui/__pycache__/layout.cpython-312.pyc and b/ui/__pycache__/layout.cpython-312.pyc differ
diff --git a/ui/__pycache__/layout.cpython-313.pyc b/ui/__pycache__/layout.cpython-313.pyc
deleted file mode 100644
index a908c03eaf7161c981a183fe9592df619a3877cc..0000000000000000000000000000000000000000
Binary files a/ui/__pycache__/layout.cpython-313.pyc and /dev/null differ
diff --git a/ui/custom.css b/ui/custom.css
index 34d6585972360cfd77b215d1e599f8c62f05f8c7..e701ad662756ea4df4382bc4e124401e4581bae8 100644
--- a/ui/custom.css
+++ b/ui/custom.css
@@ -1,252 +1,303 @@
-/* Custom CSS for Historical OCR Application */
-
-/* Global styles */
-body {
- font-family: 'Source Sans Pro', sans-serif;
- color: #333;
+/* Base Tailwind-like styles */
+:root {
+ --color-gray-900: #111827;
+ --color-gray-800: #1f2937;
+ --color-gray-700: #374151;
+ --color-gray-600: #4B5563;
+ --color-gray-500: #6B7280;
+ --color-gray-400: #9CA3AF;
+ --color-gray-300: #D1D5DB;
+ --color-gray-200: #E5E7EB;
+ --color-gray-100: #F3F4F6;
+ --color-gray-50: #F9FAFB;
+
+ --color-blue-900: #1E3A8A;
+ --color-blue-800: #1E40AF;
+ --color-blue-700: #1D4ED8;
+ --color-blue-600: #2563EB;
+ --color-blue-500: #3B82F6;
+ --color-blue-400: #60A5FA;
+ --color-blue-300: #93C5FD;
+ --color-blue-200: #BFDBFE;
+ --color-blue-100: #DBEAFE;
+ --color-blue-50: #EFF6FF;
+
+ --color-yellow-50: #FFFBEB;
+ --color-yellow-100: #FEF3C7;
+}
+
+/* Global Styles */
+.stApp {
+ background-color: var(--color-gray-900);
+ color: white;
}
-/* Header styles */
-h1, h2, h3, h4, h5, h6 {
- font-family: 'Georgia', serif;
- font-weight: 600;
- color: #1E3A8A;
+/* Main header */
+.main-header {
+ background-color: black;
+ padding: 1rem;
+ border-bottom: 1px solid var(--color-gray-700);
}
-/* Document content styling - with lower specificity to allow layout.py to override text formatting */
-.document-content {
- margin-top: 12px;
+.title-text {
+ font-size: 1.5rem;
+ font-weight: bold;
+ color: white;
}
-.document-section {
- margin-bottom: 12px;
- padding: 10px;
- background-color: #fff;
- border-radius: 8px;
- border: 1px solid #e0e0e0;
+/* Content containers */
+.content-container {
+ background-color: var(--color-gray-800);
+ border-radius: 0.75rem;
+ padding: 1.5rem;
+ margin-bottom: 1.5rem;
+ box-shadow: 0 4px 6px rgba(0, 0, 0, 0.2);
+ color: white;
}
-/* Preserve headings style while allowing font to be overridden */
-.document-section h4 {
- margin-top: 0;
- margin-bottom: 10px;
- /* color moved to layout.py */
+.blue-container {
+ background-color: var(--color-blue-100);
+ color: var(--color-gray-900);
+ border-radius: 0.75rem;
+ padding: 1.5rem;
+ margin-bottom: 1.5rem;
}
-/* Subject tag styling - lower priority than layout.py versions */
-/* These styles will be overridden by the more specific selectors in layout.py */
-.subject-tag {
- /* Basic sizing only - styling comes from layout.py */
- display: inline-block;
- margin-right: 5px;
- margin-bottom: 5px;
+.yellow-container {
+ background-color: var(--color-yellow-50);
+ color: var(--color-gray-900);
+ border-radius: 0.75rem;
+ padding: 1.5rem;
+ margin-bottom: 1.5rem;
}
-/* Tag colors moved to layout.py with !important rules */
+/* Card grid styles */
+.card-grid {
+ display: grid;
+ grid-template-columns: repeat(1, 1fr);
+ gap: 1.5rem;
+ margin-bottom: 1.5rem;
+}
-/* Image and text side-by-side styling - layout only */
-.image-text-container {
- display: flex;
- gap: 20px;
- margin-bottom: 20px;
+@media (min-width: 768px) {
+ .card-grid {
+ grid-template-columns: repeat(3, 1fr);
+ }
}
-.image-container {
- flex: 1;
+.card {
+ background-color: var(--color-gray-700);
+ border-radius: 0.5rem;
+ padding: 1rem;
+ color: white;
}
-.text-container {
- flex: 1;
- /* Text styling will come from layout.py */
+/* Special containers */
+.key-concept {
+ background-color: var(--color-gray-700);
+ border-radius: 0.5rem;
+ padding: 0.75rem;
+ margin: 1rem 0;
+ border-left: 3px solid var(--color-blue-500);
+ color: white;
}
-/* Sidebar styling */
-.sidebar-section {
- margin-bottom: 20px;
+.research-question {
+ background-color: var(--color-blue-900);
+ border-radius: 0.5rem;
+ padding: 0.75rem;
+ margin: 1rem 0;
+ border-left: 3px solid var(--color-blue-400);
+ color: white;
}
-.sidebar-section h3 {
- margin-top: 0;
- margin-bottom: 10px;
- font-size: 16px;
+.quote-container {
+ font-style: italic;
+ color: var(--color-gray-300);
+ padding: 0.5rem 1rem;
+ border-left: 3px solid var(--color-gray-600);
+ margin: 1rem 0;
}
-/* Button styling */
-.primary-button {
- background-color: #1E88E5;
- color: white;
- border: none;
- border-radius: 4px;
- padding: 6px 12px;
- font-weight: 600;
- cursor: pointer;
- transition: background-color 0.2s;
+/* Navigation */
+.nav-container {
+ position: fixed;
+ bottom: 0;
+ left: 0;
+ width: 100%;
+ background-color: black;
+ border-top: 1px solid var(--color-gray-700);
+ padding: 0.75rem 1rem;
+ display: flex;
+ justify-content: space-between;
+ z-index: 1000;
}
-.primary-button:hover {
- background-color: #1565C0;
+.nav-buttons {
+ display: flex;
+ gap: 0.5rem;
}
-.secondary-button {
- background-color: #f8f9fa;
- color: #333;
- border: 1px solid #ddd;
- border-radius: 4px;
- padding: 6px 12px;
- font-weight: 600;
+.prev-button {
+ background-color: var(--color-gray-700);
+ color: white;
+ padding: 0.5rem 1rem;
+ border-radius: 0.25rem;
+ border: none;
cursor: pointer;
- transition: background-color 0.2s;
}
-.secondary-button:hover {
- background-color: #e9ecef;
+.prev-button:hover {
+ background-color: var(--color-gray-600);
}
-/* Processing status styling */
-.processing-status {
- padding: 8px 12px;
- border-left: 4px solid #1E88E5;
- background-color: #E3F2FD;
- border-radius: 0 4px 4px 0;
- margin: 8px 0;
- font-size: 14px;
+.next-button {
+ background-color: var(--color-blue-600);
+ color: white;
+ padding: 0.5rem 1rem;
+ border-radius: 0.25rem;
+ border: none;
+ cursor: pointer;
}
-/* Previous results styling */
-.previous-results-container {
- margin-top: 12px;
+.next-button:hover {
+ background-color: var(--color-blue-700);
}
-.result-card {
- background-color: transparent;
- border-radius: 8px;
- padding: 12px;
- margin-bottom: 12px;
- transition: all 0.2s ease;
- color: #333; /* Ensure text has good contrast with background */
+.nav-dots {
+ display: none;
}
-.result-card:hover {
- box-shadow: 0 4px 8px rgba(0,0,0,0.1);
- border-color: #c0c0c0;
+@media (min-width: 768px) {
+ .nav-dots {
+ display: flex;
+ gap: 0.25rem;
+ }
}
-.result-header {
+.nav-dot {
+ width: 2rem;
+ height: 2rem;
display: flex;
- justify-content: space-between;
- margin-bottom: 10px;
+ align-items: center;
+ justify-content: center;
+ color: var(--color-gray-300);
+ border-radius: 0.25rem;
+ text-decoration: none;
+ font-size: 0.875rem;
}
-.result-filename {
- font-weight: bold;
- font-size: 16px;
- color: #333; /* Explicit text color */
+.nav-dot:hover {
+ background-color: var(--color-gray-800);
}
-.result-date {
- color: #666;
- font-size: 14px;
+.nav-dot.active {
+ background-color: var(--color-blue-800);
+ color: white;
+ font-weight: 500;
}
-.result-metadata {
- margin-top: 10px;
- font-size: 14px;
- color: #333; /* Ensure metadata text has good contrast */
+/* Override Streamlit Styles */
+.stTextInput > div > div > input {
+ background-color: var(--color-gray-700);
+ color: white;
}
-.result-tag {
- margin-bottom: 5px;
- color: #555;
+.stSelectbox > div > div > div {
+ background-color: var(--color-gray-700);
+ color: white;
}
-.result-action-button {
- margin-top: 10px;
- text-align: right;
+.stCheckbox > div > label {
+ color: white;
}
-.selected-result-container {
- margin-top: 16px;
- padding: 12px;
- background-color: #f0f2f6;
- border-radius: 8px;
- border: 1px solid #d0d7de;
- color: #333; /* Ensure text has good contrast with background */
+/* Button styling */
+.stButton > button {
+ background-color: var(--color-blue-600);
+ color: white;
}
-.selected-result-title {
- font-size: 18px;
- font-weight: bold;
- color: #1E3A8A;
+.stButton > button:hover {
+ background-color: var(--color-blue-700);
}
-/* About tab styling */
-.about-section {
- margin-bottom: 16px;
+/* Sidebars */
+[data-testid="stSidebar"] {
+ background-color: var(--color-gray-900);
}
-.about-section h3 {
- color: #1E3A8A;
- margin-bottom: 10px;
+[data-testid="stSidebar"] .stMarkdown {
+ color: white;
}
-.feature-list {
- list-style-type: none;
- padding-left: 0;
+/* Module card styles */
+.module-card {
+ background-color: var(--color-gray-800);
+ border-radius: 0.5rem;
+ padding: 1rem;
+ margin-bottom: 1rem;
+ border-top: 4px solid var(--color-blue-500);
+ transition: transform 0.2s;
}
-.feature-list li {
- margin-bottom: 8px;
- padding-left: 20px;
- position: relative;
+.module-card:hover {
+ transform: translateY(-3px);
+ box-shadow: 0 4px 8px rgba(0, 0, 0, 0.2);
}
-.feature-list li:before {
- content: "•";
- position: absolute;
- left: 0;
- color: #1E88E5;
+.module-number {
+ background-color: var(--color-blue-500);
+ color: white;
+ font-weight: bold;
+ padding: 0.25rem 0.5rem;
+ border-radius: 1rem;
+ font-size: 0.9rem;
+ display: inline-block;
+ margin-bottom: 0.5rem;
}
-/* File uploader styling */
-.file-uploader {
- border: 2px dashed #ddd;
- border-radius: 8px;
- padding: 16px;
- text-align: center;
- transition: border-color 0.2s;
+.module-title {
+ font-weight: 600;
+ margin-bottom: 0.5rem;
+ font-size: 1.1rem;
}
-.file-uploader:hover {
- border-color: #1E88E5;
+/* Add space at bottom for fixed nav */
+.main-content {
+ padding-bottom: 4rem;
}
-/* Example documents styling */
-.example-documents {
- margin-top: 12px;
+/* Tool container styles */
+.tool-container {
+ background-color: var(--color-gray-800);
+ color: white;
+ padding: 1.5rem;
+ border-radius: 0.5rem;
+ border: 1px solid var(--color-gray-700);
+ margin-bottom: 1.5rem;
}
-.example-card {
- background-color: #f8f9fa;
- border-radius: 8px;
- padding: 12px;
- margin-bottom: 12px;
- border: 1px solid #e0e0e0;
- cursor: pointer;
- transition: all 0.2s ease;
+/* Upload container */
+.upload-container {
+ border: 2px dashed var(--color-gray-600);
+ padding: 1.5rem;
+ text-align: center;
+ border-radius: 0.5rem;
+ margin-bottom: 1rem;
+ background-color: var(--color-gray-700);
}
-.example-card:hover {
- box-shadow: 0 4px 8px rgba(0,0,0,0.1);
- border-color: #c0c0c0;
+/* Footer spacing */
+.footer-spacer {
+ height: 4rem;
}
-.example-title {
- font-weight: bold;
- font-size: 16px;
- margin-bottom: 5px;
+/* Tabs */
+.stTabs [data-baseweb="tab"] {
+ color: white;
}
-.example-description {
- font-size: 14px;
- color: #555;
-}
+.stTabs [data-baseweb="tab-highlight"] {
+ background-color: var(--color-blue-600);
+}
\ No newline at end of file
diff --git a/ui/layout.py b/ui/layout.py
index 24cb399938a8bf210cf673da41ef1f52c7d558fb..ebcdb8c19ffde05be13e8feec5b1184c13fc4f39 100644
--- a/ui/layout.py
+++ b/ui/layout.py
@@ -1,373 +1,172 @@
import streamlit as st
+from pathlib import Path
+import os
+# Load custom CSS
def load_css():
- """Load custom CSS for the application - inspired by mistral-ocr implementations"""
+ css_file = Path(__file__).parent / "custom.css"
+ if css_file.exists():
+ with open(css_file) as f:
+ st.markdown(f"", unsafe_allow_html=True)
+ else:
+ st.warning("Custom CSS file not found. Some styles may be missing.")
+
+# Header component
+def header():
st.markdown("""
-
+ st.markdown('
""", unsafe_allow_html=True)
+
+# Previous button HTML
+def prev_button_html(current_module, modules):
+ if current_module > 1:
+ prev_module = current_module - 1
+ return f"""
+
+ """
+ return ""
+
+# Next button HTML
+def next_button_html(current_module, modules):
+ if current_module < len(modules):
+ next_module = current_module + 1
+ return f"""
+
+ """
+ return ""
+
+# Navigation dots HTML
+def nav_dots_html(current_module, modules):
+ dots_html = ""
+ for i, name in enumerate(modules, 1):
+ active_class = "active" if i == current_module else ""
+ dots_html += f"""
+
+ {i}
+
+ """
+ return dots_html
+
+# Helper functions for container styles
+def gray_container(content, padding="1.5rem"):
+ """Renders content in a gray container with consistent styling"""
+ st.markdown(f'
{content}
', unsafe_allow_html=True)
+
+def blue_container(content, padding="1.5rem"):
+ """Renders content in a blue container with consistent styling"""
+ st.markdown(f'
{content}
', unsafe_allow_html=True)
+
+def yellow_container(content, padding="1.5rem"):
+ """Renders content in a yellow container with consistent styling"""
+ st.markdown(f'
{content}
', unsafe_allow_html=True)
+
+def card_grid(cards):
+ """
+ Renders a responsive grid of cards
+ Args:
+ cards: List of HTML strings for each card
+ """
+ grid_html = '
', unsafe_allow_html=True)
+
+def research_question(content):
+ """Renders a research question box"""
+ st.markdown(f'
{content}
', unsafe_allow_html=True)
+
+def quote(content, author=""):
+ """Renders a quote with optional author"""
+ quote_html = f'
{content}'
+ if author:
+ quote_html += f'
— {author}'
+ quote_html += '
'
+ st.markdown(quote_html, unsafe_allow_html=True)
+
+def tool_container(content):
+ """Renders content in a tool container"""
+ st.markdown(f'
{content}
', unsafe_allow_html=True)
+
+def upload_container(content):
+ """Renders content in an upload container"""
+ st.markdown(f'
{content}
', unsafe_allow_html=True)
\ No newline at end of file
diff --git a/ui/ui_components.py b/ui/ui_components.py
deleted file mode 100644
index 8cce5949309dd92d81bcc764a80983850bdda746..0000000000000000000000000000000000000000
--- a/ui/ui_components.py
+++ /dev/null
@@ -1,590 +0,0 @@
-import streamlit as st
-import os
-import io
-import base64
-import logging
-import re
-from datetime import datetime
-from pathlib import Path
-import json
-
-# Define exports
-__all__ = [
- 'ProgressReporter',
- 'create_sidebar_options',
- 'create_file_uploader',
- 'display_document_with_images',
- 'display_previous_results',
- 'display_about_tab',
- 'display_results' # Re-export from utils.ui_utils
-]
-from constants import (
- DOCUMENT_TYPES,
- DOCUMENT_LAYOUTS,
- CUSTOM_PROMPT_TEMPLATES,
- LAYOUT_PROMPT_ADDITIONS,
- DEFAULT_PDF_DPI,
- MIN_PDF_DPI,
- MAX_PDF_DPI,
- DEFAULT_MAX_PAGES,
- PERFORMANCE_MODES,
- PREPROCESSING_DOC_TYPES,
- ROTATION_OPTIONS
-)
-from utils.text_utils import format_ocr_text, clean_raw_text, format_markdown_text # Import from text_utils
-from utils.content_utils import (
- classify_document_content,
- extract_document_text,
- extract_image_description
-)
-from utils.ui_utils import display_results
-from preprocessing import preprocess_image
-
-class ProgressReporter:
- """Class to handle progress reporting in the UI"""
-
- def __init__(self, placeholder):
- self.placeholder = placeholder
- self.progress_bar = None
- self.status_text = None
-
- def setup(self):
- """Setup the progress components"""
- with self.placeholder.container():
- self.progress_bar = st.progress(0)
- self.status_text = st.empty()
- return self
-
- def update(self, percent, status_text):
- """Update the progress bar and status text"""
- if self.progress_bar is not None:
- self.progress_bar.progress(percent / 100)
- if self.status_text is not None:
- self.status_text.text(status_text)
-
- def complete(self, success=True):
- """Complete the progress reporting"""
- if success:
- if self.progress_bar is not None:
- self.progress_bar.progress(100)
- if self.status_text is not None:
- self.status_text.text("Processing complete!")
- else:
- if self.status_text is not None:
- self.status_text.text("Processing failed.")
-
- # Clear the progress components after a delay
- import time
- time.sleep(0.8) # Short delay to show completion
- if self.progress_bar is not None:
- self.progress_bar.empty()
- if self.status_text is not None:
- self.status_text.empty()
-
-def create_sidebar_options():
- """Create and return sidebar options"""
- with st.sidebar:
- st.markdown("## OCR Settings")
-
- # Create a container for the sidebar options
- with st.container():
- # Default to using vision model (removed selection from UI)
- use_vision = True
-
- # Document type selection
- doc_type = st.selectbox("Document Type", DOCUMENT_TYPES,
- help="Select the type of document you're processing for better results")
-
- # Document layout
- doc_layout = st.selectbox("Document Layout", DOCUMENT_LAYOUTS,
- help="Select the layout of your document")
-
- # Initialize preprocessing variables with default values
- grayscale = False
- denoise = False
- contrast = 0
- rotation = 0
- use_segmentation = False
-
- # Custom prompt
- custom_prompt = ""
- # Get the template for the selected document type if not auto-detect
- if doc_type != DOCUMENT_TYPES[0]:
- prompt_template = CUSTOM_PROMPT_TEMPLATES.get(doc_type, "")
-
- # Add layout information if not standard
- if doc_layout != DOCUMENT_LAYOUTS[0]: # Not standard layout
- layout_addition = LAYOUT_PROMPT_ADDITIONS.get(doc_layout, "")
- if layout_addition:
- prompt_template += " " + layout_addition
-
- # Set the custom prompt
- custom_prompt = prompt_template
-
- # Allow user to edit the prompt (always visible)
- custom_prompt = st.text_area("Custom Processing Instructions", value=custom_prompt,
- help="Customize the instructions for processing this document",
- height=80)
-
- # Image preprocessing options (always visible)
- st.markdown("### Image Preprocessing")
-
- # Grayscale conversion
- grayscale = st.checkbox("Convert to Grayscale",
- value=True,
- help="Convert color images to grayscale for better text recognition")
-
- # Light denoising option
- denoise = st.checkbox("Light Denoising",
- value=True,
- help="Apply gentle denoising to improve text clarity")
-
- # Contrast adjustment
- contrast = st.slider("Contrast Adjustment",
- min_value=-20,
- max_value=20,
- value=5,
- step=5,
- help="Adjust image contrast (limited range)")
-
-
- # Initialize rotation (keeping it set to 0)
- rotation = 0
- use_segmentation = False
-
- # Create preprocessing options dictionary
- # Map UI document types to preprocessing document types
- doc_type_for_preprocessing = "standard"
- if "Handwritten" in doc_type:
- doc_type_for_preprocessing = "handwritten"
- elif "Newspaper" in doc_type or "Magazine" in doc_type:
- doc_type_for_preprocessing = "newspaper"
- elif "Book" in doc_type or "Publication" in doc_type:
- doc_type_for_preprocessing = "book" # Match the actual preprocessing type
-
- preprocessing_options = {
- "document_type": doc_type_for_preprocessing,
- "grayscale": grayscale,
- "denoise": denoise,
- "contrast": contrast,
- "rotation": rotation
- }
-
- # PDF-specific options
- st.markdown("### PDF Options")
- max_pages = st.number_input("Maximum Pages to Process",
- min_value=1,
- max_value=20,
- value=DEFAULT_MAX_PAGES,
- help="Limit the number of pages to process (for multi-page PDFs)")
-
- # Set default values for removed options
- pdf_dpi = DEFAULT_PDF_DPI
- pdf_rotation = 0
-
- # Create options dictionary
- options = {
- "use_vision": use_vision,
- "perf_mode": "Quality", # Default to Quality, removed performance mode option
- "pdf_dpi": pdf_dpi,
- "max_pages": max_pages,
- "pdf_rotation": pdf_rotation,
- "custom_prompt": custom_prompt,
- "preprocessing_options": preprocessing_options,
- "use_segmentation": use_segmentation if 'use_segmentation' in locals() else False
- }
-
- return options
-
-def create_file_uploader():
- """Create and return a file uploader"""
- # Add app description
- st.markdown(f'
📜
Historical OCR
', unsafe_allow_html=True)
- st.markdown("
Made possible by Mistral AI
", unsafe_allow_html=True)
-
- # Add project framing
- st.markdown("""
- This tool assists scholars in historical research by extracting text from challenging documents. While it may not achieve 100% accuracy, it helps navigate:
- - **Historical newspapers** with complex layouts
- - **Handwritten documents** from various periods
- - **Photos of archival materials**
-
- Upload a document to begin, or explore the examples.
- """)
-
- # Create file uploader with a more concise label
- uploaded_file = st.file_uploader(
- "Select file",
- type=["pdf", "png", "jpg"],
- help="Upload a PDF or image file for OCR processing"
- )
- return uploaded_file
-
-def display_document_with_images(result):
- """Display document with images"""
- # Check for pages_data first
- if 'pages_data' in result and result['pages_data']:
- pages_data = result['pages_data']
- # If pages_data not available, try to extract from raw_response_data
- elif 'raw_response_data' in result and isinstance(result['raw_response_data'], dict) and 'pages' in result['raw_response_data']:
- # Build pages_data from raw_response_data
- pages_data = []
- raw_pages = result['raw_response_data']['pages']
-
- for page_idx, page in enumerate(raw_pages):
- if not isinstance(page, dict):
- continue
-
- page_data = {
- 'page_number': page_idx + 1,
- 'markdown': page.get('markdown', ''),
- 'images': []
- }
-
- # Extract images if present
- if 'images' in page and isinstance(page['images'], list):
- for img_idx, img in enumerate(page['images']):
- if isinstance(img, dict) and ('base64' in img or 'image_base64' in img):
- img_base64 = img.get('image_base64', img.get('base64', ''))
- if img_base64:
- page_data['images'].append({
- 'id': img.get('id', f"img_{page_idx}_{img_idx}"),
- 'image_base64': img_base64
- })
-
- if page_data['markdown'] or page_data['images']:
- pages_data.append(page_data)
- else:
- st.info("No image data available.")
- return
-
- # Display each page
- for i, page_data in enumerate(pages_data):
- st.markdown(f"### Page {i+1}")
-
- # Display only the image (removed text column)
- # Display the image - check multiple possible field names
- image_displayed = False
-
- # Try 'image_data' field first
- if 'image_data' in page_data:
- try:
- # Convert base64 to image
- image_data = base64.b64decode(page_data['image_data'])
- st.image(io.BytesIO(image_data), use_container_width=True)
- image_displayed = True
- except Exception as e:
- st.error(f"Error displaying image from image_data: {str(e)}")
-
- # Try 'images' array if image_data didn't work
- if not image_displayed and 'images' in page_data and len(page_data['images']) > 0:
- for img in page_data['images']:
- if 'image_base64' in img:
- try:
- st.image(img['image_base64'], use_container_width=True)
- image_displayed = True
- break
- except Exception as e:
- st.error(f"Error displaying image from images array: {str(e)}")
-
- # Try alternative image source if still not displayed
- if not image_displayed and 'raw_response_data' in result:
- raw_data = result['raw_response_data']
- if isinstance(raw_data, dict) and 'pages' in raw_data:
- for raw_page in raw_data['pages']:
- if isinstance(raw_page, dict) and 'images' in raw_page:
- for img in raw_page['images']:
- if isinstance(img, dict) and 'base64' in img:
- st.image(img['base64'], use_container_width=True)
- st.caption("Image from OCR response")
- image_displayed = True
- break
- if image_displayed:
- break
-
- if not image_displayed:
- st.info("No image available for this page.")
-
- # Extract and display alt text if available
- page_text = ""
- if 'text' in page_data:
- page_text = page_data['text']
- elif 'markdown' in page_data:
- page_text = page_data['markdown']
-
- if page_text and page_text.startswith("![") and page_text.endswith(")"):
- try:
- alt_text = page_text[2:page_text.index(']')]
- if alt_text and len(alt_text) > 5: # Only show if alt text is meaningful
- st.caption(f"Image description: {alt_text}")
- except:
- pass
-
-def display_previous_results():
- """Display previous results tab content in a simplified, structured view"""
-
- # Use a simple header without the button column
- st.header("Previous Results")
-
- # Display previous results if available
- if not st.session_state.previous_results:
- st.markdown("""
-
-
📄
-
No Previous Results
-
Process a document to see your results history.
-
- """, unsafe_allow_html=True)
- else:
- # Prepare zip download outside of the UI flow
- try:
- # Create download button for all results
- from utils.image_utils import create_results_zip_in_memory
- zip_data = create_results_zip_in_memory(st.session_state.previous_results)
- timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
-
- # Simplified filename
- zip_filename = f"ocr_results_{timestamp}.zip"
-
- # Encode the zip data for direct download link
- zip_b64 = base64.b64encode(zip_data).decode()
-
- # Add styled download tag in the metadata section
- download_html = '
'
- st.markdown(download_html, unsafe_allow_html=True)
- except Exception:
- # Silent fail - no error message to keep UI clean
- pass
-
- # Create a cleaner, more minimal grid for results using Streamlit columns
- # Calculate number of columns based on screen width - more responsive
- num_columns = 2 # Two columns for most screens
-
- # Create rows of result cards
- for i in range(0, len(st.session_state.previous_results), num_columns):
- # Create a row of columns
- cols = st.columns(num_columns)
-
- # Fill each column with a result card
- for j in range(num_columns):
- index = i + j
- if index < len(st.session_state.previous_results):
- result = st.session_state.previous_results[index]
-
- # Get basic info for the card
- file_name = result.get("file_name", f"Document {index+1}")
- timestamp = result.get("timestamp", "")
-
- # Determine file type icon
- if file_name.lower().endswith(".pdf"):
- icon = "📄"
- elif any(file_name.lower().endswith(ext) for ext in [".jpg", ".jpeg", ".png", ".gif"]):
- icon = "🖼️"
- else:
- icon = "📝"
-
- # Display a simplified card in each column
- with cols[j]:
- # Use a container for better styling control
- with st.container():
- # Create visually cleaner card with less vertical space
- st.markdown(f"""
-
-
-
{icon} {file_name}
-
{timestamp.split()[0] if timestamp else ""}
-
-
- """, unsafe_allow_html=True)
-
- # Add a simple button below each card
- if st.button(f"View", key=f"view_{index}", help=f"View {file_name}"):
- st.session_state.selected_previous_result = st.session_state.previous_results[index]
- st.rerun()
-
- # Display the selected result if available
- if 'selected_previous_result' in st.session_state and st.session_state.selected_previous_result:
- selected_result = st.session_state.selected_previous_result
-
- # Draw a separator between results list and selected document
- st.markdown("", unsafe_allow_html=True)
-
- # Create a cleaner header for the selected document
- file_name = selected_result.get('file_name', 'Document')
- st.subheader(f"{file_name}")
-
- # Add a simple back button at the top
- if st.button("← Back to Results", key="back_to_results"):
- if 'selected_previous_result' in st.session_state:
- del st.session_state.selected_previous_result
- st.session_state.perform_reset = True
- st.rerun()
-
- # Simplified metadata display - just one line with essential info
- meta_html = '
'
-
- # Add timestamp
- if 'timestamp' in selected_result:
- meta_html += f'
{selected_result["timestamp"]}
'
-
- # Add languages if available (simplified)
- if 'languages' in selected_result and selected_result['languages']:
- languages = [lang for lang in selected_result['languages'] if lang is not None]
- if languages:
- meta_html += f'
Language: {", ".join(languages)}
'
-
- # Add page count if available (simplified)
- if 'limited_pages' in selected_result:
- meta_html += f'
'
- st.markdown(meta_html, unsafe_allow_html=True)
-
- # Simplified tabs - using the same format as main view
- has_images = selected_result.get('has_images', False)
- if has_images:
- view_tabs = st.tabs(["Document Content", "Raw JSON", "Images"])
- view_tab1, view_tab2, view_tab3 = view_tabs
- else:
- view_tabs = st.tabs(["Document Content", "Raw JSON"])
- view_tab1, view_tab2 = view_tabs
- view_tab3 = None
-
- # First tab - Document Content (simplified structured view)
- with view_tab1:
- # Display content in a cleaner, more streamlined format
- if 'ocr_contents' in selected_result and isinstance(selected_result['ocr_contents'], dict):
- # Create a more focused list of important sections
- priority_sections = ["title", "content", "transcript", "summary"]
- displayed_sections = set()
-
- # First display priority sections
- for section in priority_sections:
- if section in selected_result['ocr_contents'] and selected_result['ocr_contents'][section]:
- content = selected_result['ocr_contents'][section]
- if isinstance(content, str) and content.strip():
- # Only add a subheader for meaningful section names, not raw_text
- if section != "raw_text":
- st.markdown(f"##### {section.replace('_', ' ').title()}")
-
- # Format and display content
- formatted_content = format_ocr_text(content, for_display=True)
- st.markdown(formatted_content)
- displayed_sections.add(section)
-
- # Then display any remaining sections not already shown
- for section, content in selected_result['ocr_contents'].items():
- if (section not in displayed_sections and
- section not in ['error', 'partial_text'] and
- content):
- st.markdown(f"##### {section.replace('_', ' ').title()}")
-
- if isinstance(content, str):
- st.markdown(format_ocr_text(content, for_display=True))
- elif isinstance(content, list):
- for item in content:
- st.markdown(f"- {item}")
- elif isinstance(content, dict):
- for k, v in content.items():
- st.markdown(f"**{k}:** {v}")
-
- # Second tab - Raw JSON (simplified)
- with view_tab2:
- # Extract the relevant JSON data
- json_data = {}
-
- # Include important metadata
- for field in ['file_name', 'timestamp', 'processing_time', 'title', 'languages', 'topics', 'subjects', 'text',' raw_text']:
- if field in selected_result:
- json_data[field] = selected_result[field]
-
- # Include OCR contents
- if 'ocr_contents' in selected_result:
- json_data['ocr_contents'] = selected_result['ocr_contents']
-
- # Format the JSON prettily
- json_str = json.dumps(json_data, indent=2)
-
- # Display in a monospace font with syntax highlighting
- st.code(json_str, language="json")
-
- # Third tab - Images (simplified)
- if has_images and view_tab3 is not None:
- with view_tab3:
- # Simplified image display
- if 'pages_data' in selected_result:
- for i, page_data in enumerate(selected_result['pages_data']):
- # Display each page
- if 'images' in page_data and len(page_data['images']) > 0:
- for img in page_data['images']:
- if 'image_base64' in img:
- st.image(img['image_base64'], use_container_width=True)
-
- # Get page text if available
- page_text = ""
- if 'markdown' in page_data:
- page_text = page_data['markdown']
-
- # Display text if available
- if page_text:
- with st.expander(f"Page {i+1} Text", expanded=False):
- st.text(page_text)
-
-def display_about_tab():
- """Display learn more tab content"""
- st.header("Learn More")
-
- # Add app description
- st.markdown("""
- **Historical OCR** is a tailored academic tool for extracting text from historical documents, manuscripts, and printed materials.
- """)
-
- # Purpose section with consistent formatting
- st.markdown("### Purpose")
- st.markdown("""
- This tool is designed to assist scholars in historical research by extracting text from challenging documents.
- While it may not achieve full accuracy for all materials, it serves as a tailored research aid for navigating
- historical documents, particularly:
- """)
-
- st.markdown("""
- - **Historical newspapers** with complex layouts and aged text
- - **Handwritten documents** from various time periods
- - **Photos of archival materials** that may be difficult to read
- """)
-
- # Features section with consistent formatting
- st.markdown("### Features")
- st.markdown("""
- - **Advanced Image Preprocessing**: Optimize historical documents for better OCR results
- - **Custom Document Type Processing**: Specialized handling for newspapers, letters, books, and more
- - **Editable Results**: Review and edit extracted text directly in the interface
- - **Structured Content Analysis**: Automatic organization of document content
- - **Multi-language Support**: Process documents in various languages
- - **PDF Processing**: Handle multi-page historical documents
- """)
-
- # How to Use section with consistent formatting
- st.markdown("### How to Use")
- st.markdown("""
- 1. Upload a document (PDF or image)
- 2. Select the document type and adjust preprocessing options if needed
- 3. Add custom processing instructions for specialized documents
- 4. Process the document
- 5. Review, edit, and download the results
- """)
-
- # Technologies section with consistent formatting
- st.markdown("### Technologies")
- st.markdown("""
- - OCR processing using Mistral AI's advanced document understanding capabilities
- - Image preprocessing with OpenCV
- - PDF handling with pdf2image
- - Web interface with Streamlit
- """)
-
- # Add version information
- st.markdown("**Version:** 2.0.0")
diff --git a/ui_components.py b/ui_components.py
deleted file mode 100644
index 8b7d7a812ba6b809854f9751f6e6d3b112faa9fe..0000000000000000000000000000000000000000
--- a/ui_components.py
+++ /dev/null
@@ -1,604 +0,0 @@
-import streamlit as st
-import os
-import io
-import base64
-import logging
-import re
-from datetime import datetime
-from pathlib import Path
-import json
-
-# Define exports
-__all__ = [
- 'ProgressReporter',
- 'create_sidebar_options',
- 'create_file_uploader',
- 'display_document_with_images',
- 'display_previous_results',
- 'display_about_tab',
- 'display_results' # Re-export from utils.ui_utils
-]
-from constants import (
- DOCUMENT_TYPES,
- DOCUMENT_LAYOUTS,
- CUSTOM_PROMPT_TEMPLATES,
- LAYOUT_PROMPT_ADDITIONS,
- DEFAULT_PDF_DPI,
- MIN_PDF_DPI,
- MAX_PDF_DPI,
- DEFAULT_MAX_PAGES,
- PERFORMANCE_MODES,
- PREPROCESSING_DOC_TYPES,
- ROTATION_OPTIONS
-)
-from utils.text_utils import format_ocr_text, clean_raw_text, format_markdown_text # Import from text_utils
-from utils.content_utils import (
- classify_document_content,
- extract_document_text,
- extract_image_description
-)
-from utils.ui_utils import display_results
-from preprocessing import preprocess_image
-
-class ProgressReporter:
- """Class to handle progress reporting in the UI"""
-
- def __init__(self, placeholder):
- self.placeholder = placeholder
- self.progress_bar = None
- self.status_text = None
-
- def setup(self):
- """Setup the progress components"""
- with self.placeholder.container():
- self.progress_bar = st.progress(0)
- self.status_text = st.empty()
- return self
-
- def update(self, percent, status_text):
- """Update the progress bar and status text"""
- if self.progress_bar is not None:
- self.progress_bar.progress(percent / 100)
- if self.status_text is not None:
- self.status_text.text(status_text)
-
- def complete(self, success=True):
- """Complete the progress reporting"""
- if success:
- if self.progress_bar is not None:
- self.progress_bar.progress(100)
- if self.status_text is not None:
- self.status_text.text("Processing complete!")
- else:
- if self.status_text is not None:
- self.status_text.text("Processing failed.")
-
- # Clear the progress components after a delay
- import time
- time.sleep(0.8) # Short delay to show completion
- if self.progress_bar is not None:
- self.progress_bar.empty()
- if self.status_text is not None:
- self.status_text.empty()
-
-def create_sidebar_options():
- """Create and return sidebar options"""
- with st.sidebar:
- st.markdown("## OCR Settings")
-
- # Create a container for the sidebar options
- with st.container():
- # Default to using vision model (removed selection from UI)
- use_vision = True
-
- # Document type selection
- doc_type = st.selectbox("Document Type", DOCUMENT_TYPES,
- help="Select the type of document you're processing for better results")
-
- # Document layout
- doc_layout = st.selectbox("Document Layout", DOCUMENT_LAYOUTS,
- help="Select the layout of your document")
-
- # Initialize preprocessing variables with default values
- grayscale = False
- denoise = False
- contrast = 0
- rotation = 0
- use_segmentation = False
-
- # Custom prompt
- custom_prompt = ""
- # Get the template for the selected document type if not auto-detect
- if doc_type != DOCUMENT_TYPES[0]:
- prompt_template = CUSTOM_PROMPT_TEMPLATES.get(doc_type, "")
-
- # Add layout information if not standard
- if doc_layout != DOCUMENT_LAYOUTS[0]: # Not standard layout
- layout_addition = LAYOUT_PROMPT_ADDITIONS.get(doc_layout, "")
- if layout_addition:
- prompt_template += " " + layout_addition
-
- # Set the custom prompt
- custom_prompt = prompt_template
-
- # Allow user to edit the prompt (always visible)
- custom_prompt = st.text_area("Custom Processing Instructions", value=custom_prompt,
- help="Customize the instructions for processing this document",
- height=80)
-
- # Image preprocessing options (always visible)
- st.markdown("### Image Preprocessing")
-
- # Grayscale conversion
- grayscale = st.checkbox("Convert to Grayscale",
- value=True,
- help="Convert color images to grayscale for better text recognition")
-
- # Light denoising option
- denoise = st.checkbox("Light Denoising",
- value=True,
- help="Apply gentle denoising to improve text clarity")
-
- # Contrast adjustment
- contrast = st.slider("Contrast Adjustment",
- min_value=-20,
- max_value=20,
- value=5,
- step=5,
- help="Adjust image contrast (limited range)")
-
-
- # Initialize rotation (keeping it set to 0)
- rotation = 0
- use_segmentation = False
-
- # Create preprocessing options dictionary
- # Map UI document types to preprocessing document types
- doc_type_for_preprocessing = "standard"
- if "Handwritten" in doc_type:
- doc_type_for_preprocessing = "handwritten"
- elif "Newspaper" in doc_type or "Magazine" in doc_type:
- doc_type_for_preprocessing = "newspaper"
- elif "Book" in doc_type or "Publication" in doc_type:
- doc_type_for_preprocessing = "book" # Match the actual preprocessing type
-
- preprocessing_options = {
- "document_type": doc_type_for_preprocessing,
- "grayscale": grayscale,
- "denoise": denoise,
- "contrast": contrast,
- "rotation": rotation
- }
-
- # PDF-specific options
- st.markdown("### PDF Options")
- max_pages = st.number_input("Maximum Pages to Process",
- min_value=1,
- max_value=20,
- value=DEFAULT_MAX_PAGES,
- help="Limit the number of pages to process (for multi-page PDFs)")
-
- # Set default values for removed options
- pdf_dpi = DEFAULT_PDF_DPI
- pdf_rotation = 0
-
- # Create options dictionary
- options = {
- "use_vision": use_vision,
- "perf_mode": "Quality", # Default to Quality, removed performance mode option
- "pdf_dpi": pdf_dpi,
- "max_pages": max_pages,
- "pdf_rotation": pdf_rotation,
- "custom_prompt": custom_prompt,
- "preprocessing_options": preprocessing_options,
- "use_segmentation": use_segmentation if 'use_segmentation' in locals() else False
- }
-
- return options
-
-def create_file_uploader():
- """Create and return a file uploader"""
- # Add app description
- st.markdown(f'
📜
Historical OCR
', unsafe_allow_html=True)
- st.markdown("
Made possible by Mistral AI
", unsafe_allow_html=True)
-
- # Add project framing
- st.markdown("""
- This tool assists scholars in historical research by extracting text from challenging documents. While it may not achieve 100% accuracy, it helps navigate:
- - **Historical newspapers** with complex layouts
- - **Handwritten documents** from various periods
- - **Photos of archival materials**
-
- Upload a document to begin, or explore the examples.
- """)
-
- # Create file uploader with a more concise label
- uploaded_file = st.file_uploader(
- "Select file",
- type=["pdf", "png", "jpg"],
- help="Upload a PDF or image file for OCR processing"
- )
- return uploaded_file
-
-def display_document_with_images(result):
- """Display document with images"""
- # Check for pages_data first
- if 'pages_data' in result and result['pages_data']:
- pages_data = result['pages_data']
- # If pages_data not available, try to extract from raw_response_data
- elif 'raw_response_data' in result and isinstance(result['raw_response_data'], dict) and 'pages' in result['raw_response_data']:
- # Build pages_data from raw_response_data
- pages_data = []
- raw_pages = result['raw_response_data']['pages']
-
- for page_idx, page in enumerate(raw_pages):
- if not isinstance(page, dict):
- continue
-
- page_data = {
- 'page_number': page_idx + 1,
- 'markdown': page.get('markdown', ''),
- 'images': []
- }
-
- # Extract images if present
- if 'images' in page and isinstance(page['images'], list):
- for img_idx, img in enumerate(page['images']):
- if isinstance(img, dict) and ('base64' in img or 'image_base64' in img):
- img_base64 = img.get('image_base64', img.get('base64', ''))
- if img_base64:
- page_data['images'].append({
- 'id': img.get('id', f"img_{page_idx}_{img_idx}"),
- 'image_base64': img_base64
- })
-
- if page_data['markdown'] or page_data['images']:
- pages_data.append(page_data)
- else:
- st.info("No image data available.")
- return
-
- # Display each page
- for i, page_data in enumerate(pages_data):
- st.markdown(f"### Page {i+1}")
-
- # Display only the image (removed text column)
- # Display the image - check multiple possible field names
- image_displayed = False
-
- # Try 'image_data' field first
- if 'image_data' in page_data:
- try:
- # Convert base64 to image
- image_data = base64.b64decode(page_data['image_data'])
- st.image(io.BytesIO(image_data), use_container_width=True)
- image_displayed = True
- except Exception as e:
- st.error(f"Error displaying image from image_data: {str(e)}")
-
- # Try 'images' array if image_data didn't work
- if not image_displayed and 'images' in page_data and len(page_data['images']) > 0:
- for img in page_data['images']:
- if 'image_base64' in img:
- try:
- st.image(img['image_base64'], use_container_width=True)
- image_displayed = True
- break
- except Exception as e:
- st.error(f"Error displaying image from images array: {str(e)}")
-
- # Try alternative image source if still not displayed
- if not image_displayed and 'raw_response_data' in result:
- raw_data = result['raw_response_data']
- if isinstance(raw_data, dict) and 'pages' in raw_data:
- for raw_page in raw_data['pages']:
- if isinstance(raw_page, dict) and 'images' in raw_page:
- for img in raw_page['images']:
- if isinstance(img, dict) and 'base64' in img:
- st.image(img['base64'], use_container_width=True)
- st.caption("Image from OCR response")
- image_displayed = True
- break
- if image_displayed:
- break
-
- if not image_displayed:
- st.info("No image available for this page.")
-
- # Extract and display alt text if available
- page_text = ""
- if 'text' in page_data:
- page_text = page_data['text']
- elif 'markdown' in page_data:
- page_text = page_data['markdown']
-
- if page_text and page_text.startswith("![") and page_text.endswith(")"):
- try:
- alt_text = page_text[2:page_text.index(']')]
- if alt_text and len(alt_text) > 5: # Only show if alt text is meaningful
- st.caption(f"Image description: {alt_text}")
- except:
- pass
-
-def display_previous_results():
- """Display previous results tab content in a simplified, structured view"""
-
- # Use a simple header without the button column
- st.header("Previous Results")
-
- # Display previous results if available
- if not st.session_state.previous_results:
- st.markdown("""
-
-
📄
-
No Previous Results
-
Process a document to see your results history.
-
- """, unsafe_allow_html=True)
- else:
- # Prepare zip download outside of the UI flow
- try:
- # Create download button for all results
- from utils.image_utils import create_results_zip_in_memory
- zip_data = create_results_zip_in_memory(st.session_state.previous_results)
- timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
-
- # Simplified filename
- zip_filename = f"ocr_results_{timestamp}.zip"
-
- # Encode the zip data for direct download link
- zip_b64 = base64.b64encode(zip_data).decode()
-
- # Add styled download tag in the metadata section
- download_html = '
'
- st.markdown(download_html, unsafe_allow_html=True)
- except Exception:
- # Silent fail - no error message to keep UI clean
- pass
-
- # Create a cleaner, more minimal grid for results using Streamlit columns
- # Calculate number of columns based on screen width - more responsive
- num_columns = 2 # Two columns for most screens
-
- # Create rows of result cards
- for i in range(0, len(st.session_state.previous_results), num_columns):
- # Create a row of columns
- cols = st.columns(num_columns)
-
- # Fill each column with a result card
- for j in range(num_columns):
- index = i + j
- if index < len(st.session_state.previous_results):
- result = st.session_state.previous_results[index]
-
- # Get basic info for the card
- file_name = result.get("file_name", f"Document {index+1}")
- timestamp = result.get("timestamp", "")
-
- # Determine file type icon
- if file_name.lower().endswith(".pdf"):
- icon = "📄"
- elif any(file_name.lower().endswith(ext) for ext in [".jpg", ".jpeg", ".png", ".gif"]):
- icon = "🖼️"
- else:
- icon = "📝"
-
- # Display a simplified card in each column
- with cols[j]:
- # Use a container for better styling control
- with st.container():
- # Create visually cleaner card with less vertical space
- st.markdown(f"""
-
-
-
{icon} {file_name}
-
{timestamp.split()[0] if timestamp else ""}
-
-
- """, unsafe_allow_html=True)
-
- # Add a simple button below each card
- if st.button(f"View", key=f"view_{index}", help=f"View {file_name}"):
- st.session_state.selected_previous_result = st.session_state.previous_results[index]
- st.rerun()
-
- # Display the selected result if available
- if 'selected_previous_result' in st.session_state and st.session_state.selected_previous_result:
- selected_result = st.session_state.selected_previous_result
-
- # Draw a separator between results list and selected document
- st.markdown("", unsafe_allow_html=True)
-
- # Create a cleaner header for the selected document
- file_name = selected_result.get('file_name', 'Document')
- st.subheader(f"{file_name}")
-
- # Add a simple back button at the top
- if st.button("← Back to Results", key="back_to_results"):
- if 'selected_previous_result' in st.session_state:
- del st.session_state.selected_previous_result
- st.session_state.perform_reset = True
- st.rerun()
-
- # Simplified metadata display - just one line with essential info
- meta_html = '
'
-
- # Add timestamp
- if 'timestamp' in selected_result:
- meta_html += f'
{selected_result["timestamp"]}
'
-
- # Add languages if available (simplified)
- if 'languages' in selected_result and selected_result['languages']:
- languages = [lang for lang in selected_result['languages'] if lang is not None]
- if languages:
- meta_html += f'
Language: {", ".join(languages)}
'
-
- # Add page count if available (simplified)
- if 'limited_pages' in selected_result:
- meta_html += f'
'
- st.markdown(meta_html, unsafe_allow_html=True)
-
- # Simplified tabs - using the same format as main view
- has_images = selected_result.get('has_images', False)
- if has_images:
- view_tabs = st.tabs(["Document Content", "Raw JSON", "Images"])
- view_tab1, view_tab2, view_tab3 = view_tabs
- else:
- view_tabs = st.tabs(["Document Content", "Raw JSON"])
- view_tab1, view_tab2 = view_tabs
- view_tab3 = None
-
- # First tab - Document Content (simplified structured view)
- with view_tab1:
- # Display content in a cleaner, more streamlined format
- if 'ocr_contents' in selected_result and isinstance(selected_result['ocr_contents'], dict):
- # Create a more focused list of important sections
- priority_sections = ["title", "content", "transcript", "summary"]
- displayed_sections = set()
-
- # First display priority sections
- for section in priority_sections:
- if section in selected_result['ocr_contents'] and selected_result['ocr_contents'][section]:
- content = selected_result['ocr_contents'][section]
- if isinstance(content, str) and content.strip():
- # Only add a subheader for meaningful section names, not raw_text
- if section != "raw_text":
- st.markdown(f"##### {section.replace('_', ' ').title()}")
-
- # Format and display content
- formatted_content = format_ocr_text(content, for_display=True)
- st.markdown(formatted_content)
- displayed_sections.add(section)
-
- # Then display any remaining sections not already shown
- for section, content in selected_result['ocr_contents'].items():
- if (section not in displayed_sections and
- section not in ['error', 'partial_text'] and
- content):
- st.markdown(f"##### {section.replace('_', ' ').title()}")
-
- if isinstance(content, str):
- st.markdown(format_ocr_text(content, for_display=True))
- elif isinstance(content, list):
- for item in content:
- st.markdown(f"- {item}")
- elif isinstance(content, dict):
- for k, v in content.items():
- st.markdown(f"**{k}:** {v}")
-
- # Second tab - Raw JSON (simplified)
- with view_tab2:
- # Extract the relevant JSON data
- json_data = {}
-
- # Include important metadata
- for field in ['file_name', 'timestamp', 'processing_time', 'languages', 'topics', 'subjects', 'detected_document_type', 'text']:
- if field in selected_result:
- json_data[field] = selected_result[field]
-
- # Include OCR contents
- if 'ocr_contents' in selected_result:
- json_data['ocr_contents'] = selected_result['ocr_contents']
-
- # Exclude large binary data like base64 images to keep JSON clean
- if 'pages_data' in selected_result:
- # Create simplified pages_data without large binary content
- simplified_pages = []
- for page in selected_result['pages_data']:
- simplified_page = {
- 'page_number': page.get('page_number', 0),
- 'has_text': bool(page.get('markdown', '')),
- 'has_images': bool(page.get('images', [])),
- 'image_count': len(page.get('images', []))
- }
- simplified_pages.append(simplified_page)
- json_data['pages_summary'] = simplified_pages
-
- # Format the JSON prettily
- json_str = json.dumps(json_data, indent=2)
-
- # Display in a monospace font with syntax highlighting
- st.code(json_str, language="json")
-
- # Third tab - Images (simplified)
- if has_images and view_tab3 is not None:
- with view_tab3:
- # Simplified image display
- if 'pages_data' in selected_result:
- for i, page_data in enumerate(selected_result['pages_data']):
- # Display each page
- if 'images' in page_data and len(page_data['images']) > 0:
- for img in page_data['images']:
- if 'image_base64' in img:
- st.image(img['image_base64'], use_container_width=True)
-
- # Get page text if available
- page_text = ""
- if 'markdown' in page_data:
- page_text = page_data['markdown']
-
- # Display text if available
- if page_text:
- with st.expander(f"Page {i+1} Text", expanded=False):
- st.text(page_text)
-
-def display_about_tab():
- """Display learn more tab content"""
- st.header("Learn More")
-
- # Add app description
- st.markdown("""
- **Historical OCR** is a tailored academic tool for extracting text from historical documents, manuscripts, and printed materials.
- """)
-
- # Purpose section with consistent formatting
- st.markdown("### Purpose")
- st.markdown("""
- This tool is designed to assist scholars in historical research by extracting text from challenging documents.
- While it may not achieve full accuracy for all materials, it serves as a tailored research aid for navigating
- historical documents, particularly:
- """)
-
- st.markdown("""
- - **Historical newspapers** with complex layouts and aged text
- - **Handwritten documents** from various time periods
- - **Photos of archival materials** that may be difficult to read
- """)
-
- # Features section with consistent formatting
- st.markdown("### Features")
- st.markdown("""
- - **Advanced Image Preprocessing**: Optimize historical documents for better OCR results
- - **Custom Document Type Processing**: Specialized handling for newspapers, letters, books, and more
- - **Editable Results**: Review and edit extracted text directly in the interface
- - **Structured Content Analysis**: Automatic organization of document content
- - **Multi-language Support**: Process documents in various languages
- - **PDF Processing**: Handle multi-page historical documents
- """)
-
- # How to Use section with consistent formatting
- st.markdown("### How to Use")
- st.markdown("""
- 1. Upload a document (PDF or image)
- 2. Select the document type and adjust preprocessing options if needed
- 3. Add custom processing instructions for specialized documents
- 4. Process the document
- 5. Review, edit, and download the results
- """)
-
- # Technologies section with consistent formatting
- st.markdown("### Technologies")
- st.markdown("""
- - OCR processing using Mistral AI's advanced document understanding capabilities
- - Image preprocessing with OpenCV
- - PDF handling with pdf2image
- - Web interface with Streamlit
- """)
-
- # Add version information
- st.markdown("**Version:** 1.0.0")
diff --git a/utils.py b/utils.py
deleted file mode 100644
index 88a47ecc79968faa526b7b597dff14d19e91ce79..0000000000000000000000000000000000000000
--- a/utils.py
+++ /dev/null
@@ -1,414 +0,0 @@
-import os
-import base64
-import hashlib
-import time
-import logging
-from datetime import datetime
-from pathlib import Path
-from functools import wraps
-from constants import CONTENT_THEMES, PERIOD_TAGS, DEFAULT_TAGS, GENERIC_TAGS
-
-# Configure logging
-logger = logging.getLogger("utils")
-logger.setLevel(logging.INFO)
-
-def get_base64_from_image(image_path):
- """
- Get base64 data URL from image file with proper MIME type.
-
- Args:
- image_path: Path to the image file
-
- Returns:
- Base64 data URL with appropriate MIME type prefix
- """
- try:
- # Convert to Path object for better handling
- path_obj = Path(image_path)
-
- # Determine mime type based on file extension
- mime_type = 'image/jpeg' # Default mime type
- suffix = path_obj.suffix.lower()
- if suffix == '.png':
- mime_type = 'image/png'
- elif suffix == '.gif':
- mime_type = 'image/gif'
- elif suffix in ['.jpg', '.jpeg']:
- mime_type = 'image/jpeg'
- elif suffix == '.pdf':
- mime_type = 'application/pdf'
-
- # Read and encode file
- with open(path_obj, "rb") as file:
- encoded = base64.b64encode(file.read()).decode('utf-8')
- return f"data:{mime_type};base64,{encoded}"
- except Exception as e:
- logger.error(f"Error encoding file to base64: {str(e)}")
- return ""
-
-def get_base64_from_bytes(file_bytes, mime_type=None, file_name=None):
- """
- Get base64 data URL from file bytes with proper MIME type.
-
- Args:
- file_bytes: Binary file data
- mime_type: MIME type of the file (optional)
- file_name: Original file name for MIME type detection (optional)
-
- Returns:
- Base64 data URL with appropriate MIME type prefix
- """
- try:
- # Determine mime type if not provided
- if mime_type is None and file_name is not None:
- # Get file extension
- suffix = Path(file_name).suffix.lower()
- if suffix == '.png':
- mime_type = 'image/png'
- elif suffix == '.gif':
- mime_type = 'image/gif'
- elif suffix in ['.jpg', '.jpeg']:
- mime_type = 'image/jpeg'
- elif suffix == '.pdf':
- mime_type = 'application/pdf'
- else:
- # Default to octet-stream for unknown types
- mime_type = 'application/octet-stream'
- elif mime_type is None:
- # Default MIME type if we can't determine it
- mime_type = 'application/octet-stream'
-
- # Encode and create data URL
- encoded = base64.b64encode(file_bytes).decode('utf-8')
- return f"data:{mime_type};base64,{encoded}"
- except Exception as e:
- logger.error(f"Error encoding bytes to base64: {str(e)}")
- return ""
-
-def timing(description):
- """Context manager for timing code execution"""
- class TimingContext:
- def __init__(self, description):
- self.description = description
-
- def __enter__(self):
- self.start_time = time.time()
- return self
-
- def __exit__(self, exc_type, exc_val, exc_tb):
- end_time = time.time()
- execution_time = end_time - self.start_time
- logger.info(f"{self.description} took {execution_time:.2f} seconds")
- return False
-
- return TimingContext(description)
-
-def format_timestamp(timestamp=None, for_filename=False):
- """
- Format timestamp for display or filenames
-
- Args:
- timestamp: Datetime object or string to format (defaults to current time)
- for_filename: Whether to format for use in a filename (defaults to False)
-
- Returns:
- str: Formatted timestamp
- """
- if timestamp is None:
- timestamp = datetime.now()
- elif isinstance(timestamp, str):
- try:
- timestamp = datetime.strptime(timestamp, "%Y-%m-%d %H:%M:%S")
- except ValueError:
- timestamp = datetime.now()
-
- if for_filename:
- # Format suitable for filenames: "Apr 30, 2025"
- return timestamp.strftime("%b %d, %Y")
- else:
- # Standard format for display
- return timestamp.strftime("%Y-%m-%d %H:%M")
-
-def generate_cache_key(file_bytes, file_type, use_vision, preprocessing_options=None, pdf_rotation=0, custom_prompt=None):
- """
- Generate a cache key for OCR processing
-
- Args:
- file_bytes: File content as bytes
- file_type: Type of file (pdf or image)
- use_vision: Whether to use vision model
- preprocessing_options: Dictionary of preprocessing options
- pdf_rotation: PDF rotation value
- custom_prompt: Custom prompt for OCR
-
- Returns:
- str: Cache key
- """
- # Generate file hash
- file_hash = hashlib.md5(file_bytes).hexdigest()
-
- # Include preprocessing options in cache key
- preprocessing_options_hash = ""
- if preprocessing_options:
- # Add pdf_rotation to preprocessing options to ensure it's part of the cache key
- if pdf_rotation != 0:
- preprocessing_options_with_rotation = preprocessing_options.copy()
- preprocessing_options_with_rotation['pdf_rotation'] = pdf_rotation
- preprocessing_str = str(sorted(preprocessing_options_with_rotation.items()))
- else:
- preprocessing_str = str(sorted(preprocessing_options.items()))
- preprocessing_options_hash = hashlib.md5(preprocessing_str.encode()).hexdigest()
- elif pdf_rotation != 0:
- # If no preprocessing options but we have rotation, include that in the hash
- preprocessing_options_hash = hashlib.md5(f"pdf_rotation_{pdf_rotation}".encode()).hexdigest()
-
- # Create base cache key
- cache_key = f"{file_hash}_{file_type}_{use_vision}_{preprocessing_options_hash}"
-
- # Include custom prompt in cache key if provided
- if custom_prompt:
- custom_prompt_hash = hashlib.md5(str(custom_prompt).encode()).hexdigest()
- cache_key = f"{cache_key}_{custom_prompt_hash}"
-
- return cache_key
-
-def handle_temp_files(temp_file_paths):
- """
- Clean up temporary files
-
- Args:
- temp_file_paths: List of temporary file paths to clean up
- """
- for temp_path in temp_file_paths:
- try:
- if os.path.exists(temp_path):
- os.unlink(temp_path)
- logger.info(f"Removed temporary file: {temp_path}")
- except Exception as e:
- logger.warning(f"Failed to remove temporary file {temp_path}: {str(e)}")
-
-def create_descriptive_filename(original_filename, result, file_ext, preprocessing_options=None):
- """
- Create a user-friendly descriptive filename for the result
-
- Args:
- original_filename: Original filename
- result: OCR result dictionary
- file_ext: File extension
- preprocessing_options: Dictionary of preprocessing options
-
- Returns:
- str: Human-readable descriptive filename
- """
- from datetime import datetime
-
- # Get base name without extension and capitalize words
- original_name = Path(original_filename).stem
-
- # Make the original name more readable by replacing dashes and underscores with spaces
- # Then capitalize each word
- readable_name = original_name.replace('-', ' ').replace('_', ' ')
- # Split by spaces and capitalize each word, then rejoin
- name_parts = readable_name.split()
- readable_name = ' '.join(word.capitalize() for word in name_parts)
-
- # Determine document type
- doc_type = None
- if 'detected_document_type' in result and result['detected_document_type']:
- doc_type = result['detected_document_type'].capitalize()
- elif 'topics' in result and result['topics']:
- # Use first topic as document type if not explicitly detected
- doc_type = result['topics'][0]
-
- # Find period/era information
- period_info = None
- if 'topics' in result and result['topics']:
- for tag in result['topics']:
- if "century" in tag.lower() or "pre-" in tag.lower() or "era" in tag.lower():
- period_info = tag
- break
-
- # Format metadata within parentheses if available
- metadata = []
- if doc_type:
- metadata.append(doc_type)
- if period_info:
- metadata.append(period_info)
-
- metadata_str = ""
- if metadata:
- metadata_str = f" ({', '.join(metadata)})"
-
- # Add current date for uniqueness and sorting
- current_date = format_timestamp(for_filename=True)
- date_str = f" - {current_date}"
-
- # Generate final user-friendly filename
- descriptive_name = f"{readable_name}{metadata_str}{date_str}{file_ext}"
- return descriptive_name
-
-def extract_subject_tags(result, raw_text, preprocessing_options=None):
- """
- Extract subject tags from OCR result
-
- Args:
- result: OCR result dictionary
- raw_text: Raw text from OCR
- preprocessing_options: Dictionary of preprocessing options
-
- Returns:
- list: Subject tags
- """
- subject_tags = []
-
- try:
- # Use existing topics as starting point if available
- if 'topics' in result and result['topics']:
- subject_tags = list(result['topics'])
-
- # Add document type if detected
- if 'detected_document_type' in result:
- doc_type = result['detected_document_type'].capitalize()
- if doc_type not in subject_tags:
- subject_tags.append(doc_type)
-
- # Analyze content for common themes based on keywords
- if raw_text:
- raw_text_lower = raw_text.lower()
-
- # Track keyword matches for each theme and their frequency
- theme_matches = {}
-
- # First pass - find all matching keywords for each theme
- for theme, keywords in CONTENT_THEMES.items():
- matches = []
- for keyword in keywords:
- # For multi-word keywords, we want exact phrase matching
- if " " in keyword:
- if keyword in raw_text_lower:
- matches.append(keyword)
- # For single-word keywords, we want word boundary matching to avoid partial matches
- else:
- import re
- pattern = r'\b' + re.escape(keyword) + r'\b'
- if re.search(pattern, raw_text_lower):
- matches.append(keyword)
-
- if matches:
- # Store both the matches and their count
- theme_matches[theme] = {
- "matches": matches,
- "count": len(matches)
- }
-
- # Sort themes by match count in descending order
- sorted_themes = sorted(theme_matches.keys(),
- key=lambda t: theme_matches[t]["count"],
- reverse=True)
-
- # Add the most relevant themes (more matches = more relevant)
- # Limit to top 5 themes to avoid too many irrelevant tags
- top_themes = sorted_themes[:5] if len(sorted_themes) > 5 else sorted_themes
-
- # Add historical period tags first (they're often most important for historical research)
- period_themes = [t for t in top_themes if t in [
- "Prehistoric", "Ancient World", "Medieval", "Renaissance",
- "Early Modern", "18th Century", "19th Century", "20th Century", "Contemporary"
- ]]
-
- for theme in period_themes:
- if theme not in subject_tags:
- subject_tags.append(theme)
-
- # Then add the remaining top themes
- for theme in top_themes:
- if theme not in period_themes and theme not in subject_tags:
- subject_tags.append(theme)
-
- # Add debug information to log
- if theme_matches:
- logger.info(f"Extracted themes: {', '.join(top_themes)}")
- logger.info(f"Theme match details: {theme_matches}")
-
- # Add document period tag if date patterns are detected
- if raw_text:
- # Look for years in content
- import re
- year_matches = re.findall(r'\b1[0-9]{3}\b|\b20[0-1][0-9]\b', raw_text)
- if year_matches:
- # Convert to integers
- years = [int(y) for y in year_matches]
- # Get earliest year
- earliest = min(years)
-
- # Find the period tag for this year
- for year_range, period_tag in PERIOD_TAGS.items():
- if year_range[0] <= earliest <= year_range[1]:
- if period_tag not in subject_tags:
- subject_tags.append(period_tag)
- break
-
- # Add languages as topics if available
- if 'languages' in result and result['languages']:
- for lang in result['languages']:
- if lang and lang not in subject_tags:
- lang_tag = f"{lang} Language"
- subject_tags.append(lang_tag)
-
- # Add preprocessing information as tags if preprocessing was applied
- if preprocessing_options:
- preprocessing_methods = []
- if preprocessing_options.get("document_type", "standard") != "standard":
- doc_type = preprocessing_options["document_type"].capitalize()
- preprocessing_tag = f"Enhanced ({doc_type})"
- if preprocessing_tag not in subject_tags:
- subject_tags.append(preprocessing_tag)
-
- if preprocessing_options.get("grayscale", False):
- preprocessing_methods.append("Grayscale")
- if preprocessing_options.get("denoise", False):
- preprocessing_methods.append("Denoised")
- if preprocessing_options.get("contrast", 0) != 0:
- contrast_val = preprocessing_options.get("contrast", 0)
- if contrast_val > 0:
- preprocessing_methods.append("Contrast Enhanced")
- else:
- preprocessing_methods.append("Contrast Reduced")
- if preprocessing_options.get("rotation", 0) != 0:
- preprocessing_methods.append("Rotated")
-
- # Add a combined preprocessing tag if methods were applied
- if preprocessing_methods:
- prep_tag = "Preprocessed"
- if prep_tag not in subject_tags:
- subject_tags.append(prep_tag)
-
- # Add the specific method as a tag if only one was used
- if len(preprocessing_methods) == 1:
- method_tag = preprocessing_methods[0]
- if method_tag not in subject_tags:
- subject_tags.append(method_tag)
-
- except Exception as e:
- logger.warning(f"Error generating subject tags: {str(e)}")
- # Fallback tags if extraction fails
- if not subject_tags:
- subject_tags = DEFAULT_TAGS.copy()
-
- # Ensure we have at least 3 tags
- while len(subject_tags) < 3:
- for tag in DEFAULT_TAGS:
- if tag not in subject_tags:
- subject_tags.append(tag)
- break
- else:
- # If all default tags are already used, add generic ones
- for tag in GENERIC_TAGS:
- if tag not in subject_tags:
- subject_tags.append(tag)
- break
- else:
- # If we still can't add any more tags, break the loop
- break
-
- return subject_tags
diff --git a/utils/README.md b/utils/README.md
deleted file mode 100644
index 18ced9fb79c5c151ce633a478a1a9a9879455c2a..0000000000000000000000000000000000000000
--- a/utils/README.md
+++ /dev/null
@@ -1,75 +0,0 @@
-# OCR Utilities
-
-This directory contains utility modules for the Historical OCR project.
-
-## PDF OCR Processing
-
-The `pdf_ocr.py` module provides specialized functionality for processing PDF documents with OCR.
-
-### Features
-
-- **Robust PDF-to-Image Conversion**: Converts PDF documents to images using optimized settings before OCR processing
-- **Multi-Page Support**: Intelligently handles multi-page documents, allowing processing of specific pages or page ranges
-- **Memory-Efficient Processing**: Processes PDFs in batches to prevent memory issues with large documents
-- **Fallback Mechanism**: Falls back to structured_ocr's internal processing if direct conversion fails
-- **Cleanup Management**: Automatically cleans up temporary files after processing
-
-### Key Components
-
-- **PDFOCR**: Main class for processing PDF files with OCR
-- **PDFConversionResult**: Helper class that holds PDF conversion results and manages cleanup
-
-### Basic Usage
-
-```python
-from utils.pdf_ocr import PDFOCR
-
-# Initialize the processor
-processor = PDFOCR()
-
-# Process a PDF file (all pages, with vision model)
-result = processor.process_pdf('document.pdf')
-
-# Process a PDF file (specific pages, with vision model)
-result = processor.process_pdf('document.pdf', custom_pages=[1, 3, 5])
-
-# Process a PDF file (first N pages, without vision model)
-result = processor.process_pdf('document.pdf', max_pages=3, use_vision=False)
-
-# Process a PDF file with custom prompt
-result = processor.process_pdf(
- 'document.pdf',
- custom_prompt="This is a historical newspaper with multiple columns."
-)
-
-# Save results to JSON
-output_path = processor.save_json_output('document.pdf', 'results.json')
-```
-
-### Command Line Usage
-
-The module can also be used directly from the command line:
-
-```bash
-python utils/pdf_ocr.py document.pdf --output results.json
-python utils/pdf_ocr.py document.pdf --max-pages 3
-python utils/pdf_ocr.py document.pdf --pages 1,3,5
-python utils/pdf_ocr.py document.pdf --prompt "This is a historical newspaper with multiple columns."
-python utils/pdf_ocr.py document.pdf --no-vision
-```
-
-### How It Works
-
-1. The module first attempts to convert the PDF to images using `pdf2image`
-2. It processes the first page with the vision model (if requested) for detailed analysis
-3. Additional pages are processed with the text model for efficiency
-4. All text is combined into a single result with appropriate metadata
-5. If direct conversion fails, it falls back to using `structured_ocr.py` for PDF processing
-
-### Parameters
-
-- **pdf_path**: Path to the PDF file to process
-- **use_vision**: Whether to use vision model for improved analysis (default: True)
-- **max_pages**: Maximum number of pages to process (default: all pages)
-- **custom_pages**: Specific page numbers to process, 1-based indexing (e.g., [1, 3, 5])
-- **custom_prompt**: Custom instructions for OCR processing
diff --git a/utils/__init__.py b/utils/__init__.py
deleted file mode 100644
index cc0b6de0413010eeae9b9eb35209025452086798..0000000000000000000000000000000000000000
--- a/utils/__init__.py
+++ /dev/null
@@ -1,47 +0,0 @@
-"""
-Utility functions for historical OCR processing.
-"""
-# Re-export image utilities
-from utils.image_utils import replace_images_in_markdown, get_combined_markdown, detect_skew, clean_ocr_result
-
-# Import general utilities from the new module
-from utils.general_utils import (
- generate_cache_key,
- timing,
- format_timestamp,
- create_descriptive_filename,
- extract_subject_tags
-)
-
-# Import file utilities
-from utils.file_utils import (
- get_base64_from_image,
- get_base64_from_bytes,
- handle_temp_files
-)
-
-# Import UI utilities
-from utils.ui_utils import display_results
-
-__all__ = [
- # Image utilities
- 'replace_images_in_markdown',
- 'get_combined_markdown',
- 'detect_skew',
- 'clean_ocr_result',
-
- # General utilities
- 'generate_cache_key',
- 'timing',
- 'format_timestamp',
- 'create_descriptive_filename',
- 'extract_subject_tags',
-
- # File utilities
- 'get_base64_from_image',
- 'get_base64_from_bytes',
- 'handle_temp_files',
-
- # UI utilities
- 'display_results'
-]
diff --git a/utils/content_utils.py b/utils/content_utils.py
deleted file mode 100644
index 3615adc17dc39a0365338833288a4d8186cc2615..0000000000000000000000000000000000000000
--- a/utils/content_utils.py
+++ /dev/null
@@ -1,103 +0,0 @@
-import re
-import ast
-from .text_utils import clean_raw_text, format_markdown_text
-
-def classify_document_content(result):
- """Classify document content based on structure and content"""
- classification = {
- 'has_title': False,
- 'has_content': False,
- 'has_sections': False,
- 'is_structured': False
- }
-
- if 'ocr_contents' not in result or not isinstance(result['ocr_contents'], dict):
- return classification
-
- # Check for title
- if 'title' in result['ocr_contents'] and result['ocr_contents']['title']:
- classification['has_title'] = True
-
- # Check for content
- content_fields = ['content', 'transcript', 'text']
- for field in content_fields:
- if field in result['ocr_contents'] and result['ocr_contents'][field]:
- classification['has_content'] = True
- break
-
- # Check for sections
- section_count = 0
- for key in result['ocr_contents'].keys():
- if key not in ['raw_text', 'error'] and result['ocr_contents'][key]:
- section_count += 1
-
- classification['has_sections'] = section_count > 2
-
- # Check if structured
- classification['is_structured'] = (
- classification['has_title'] and
- classification['has_content'] and
- classification['has_sections']
- )
-
- return classification
-
-def extract_document_text(result):
- """Extract main document text content"""
- if 'ocr_contents' not in result or not isinstance(result['ocr_contents'], dict):
- return ""
-
- # Try to get the text from content fields in preferred order - prioritize main_text
- for field in ['main_text', 'content', 'transcript', 'text', 'raw_text']:
- if field in result['ocr_contents'] and result['ocr_contents'][field]:
- content = result['ocr_contents'][field]
- if isinstance(content, str):
- return content
-
- return ""
-
-def extract_image_description(image_data):
- """Extract image description from data"""
- if not image_data or not isinstance(image_data, dict):
- return ""
-
- # Try different fields that might contain descriptions
- for field in ['alt_text', 'caption', 'description']:
- if field in image_data and image_data[field]:
- return image_data[field]
-
- return ""
-
-def format_structured_data(content):
- """Format structured data like lists and dictionaries into readable markdown
-
- Args:
- content: The content to format (str, list, dict)
-
- Returns:
- Formatted markdown text
- """
- if not content:
- return ""
-
- # For string content, return as-is to maintain content purity
- # This prevents JSON-like text from being transformed inappropriately
- if isinstance(content, str):
- return content
-
- # Handle native Python lists
- if isinstance(content, list):
- if not content:
- return ""
- # Convert to markdown bullet points
- return "\n".join([f"- {item}" for item in content])
-
- # Handle native Python dictionaries
- elif isinstance(content, dict):
- if not content:
- return ""
- # Convert to markdown key-value pairs
- return "\n".join([f"**{k}**: {v}" for k, v in content.items()])
-
- # Return as string for other types
- return str(content)
diff --git a/utils/file_utils.py b/utils/file_utils.py
deleted file mode 100644
index d4ee5b806a74abf3d64061081a0c15c8b4752c5d..0000000000000000000000000000000000000000
--- a/utils/file_utils.py
+++ /dev/null
@@ -1,100 +0,0 @@
-"""
-File utility functions for historical OCR processing.
-"""
-import base64
-import logging
-from pathlib import Path
-
-# Configure logging
-logger = logging.getLogger("utils")
-logger.setLevel(logging.INFO)
-
-def get_base64_from_image(image_path):
- """
- Get base64 data URL from image file with proper MIME type.
-
- Args:
- image_path: Path to the image file
-
- Returns:
- Base64 data URL with appropriate MIME type prefix
- """
- try:
- # Convert to Path object for better handling
- path_obj = Path(image_path)
-
- # Determine mime type based on file extension
- mime_type = 'image/jpeg' # Default mime type
- suffix = path_obj.suffix.lower()
- if suffix == '.png':
- mime_type = 'image/png'
- elif suffix == '.gif':
- mime_type = 'image/gif'
- elif suffix in ['.jpg', '.jpeg']:
- mime_type = 'image/jpeg'
- elif suffix == '.pdf':
- mime_type = 'application/pdf'
-
- # Read and encode file
- with open(path_obj, "rb") as file:
- encoded = base64.b64encode(file.read()).decode('utf-8')
- return f"data:{mime_type};base64,{encoded}"
- except Exception as e:
- logger.error(f"Error encoding file to base64: {str(e)}")
- return ""
-
-def get_base64_from_bytes(file_bytes, mime_type=None, file_name=None):
- """
- Get base64 data URL from file bytes with proper MIME type.
-
- Args:
- file_bytes: Binary file data
- mime_type: MIME type of the file (optional)
- file_name: Original file name for MIME type detection (optional)
-
- Returns:
- Base64 data URL with appropriate MIME type prefix
- """
- try:
- # Determine mime type if not provided
- if mime_type is None and file_name is not None:
- # Get file extension
- suffix = Path(file_name).suffix.lower()
- if suffix == '.png':
- mime_type = 'image/png'
- elif suffix == '.gif':
- mime_type = 'image/gif'
- elif suffix in ['.jpg', '.jpeg']:
- mime_type = 'image/jpeg'
- elif suffix == '.pdf':
- mime_type = 'application/pdf'
- else:
- # Default to image/jpeg for unknown types when processing images
- mime_type = 'image/jpeg'
- elif mime_type is None:
- # Default MIME type if we can't determine it - use image/jpeg instead of application/octet-stream
- # to ensure compatibility with Mistral AI OCR API
- mime_type = 'image/jpeg'
-
- # Encode and create data URL
- encoded = base64.b64encode(file_bytes).decode('utf-8')
- return f"data:{mime_type};base64,{encoded}"
- except Exception as e:
- logger.error(f"Error encoding bytes to base64: {str(e)}")
- return ""
-
-def handle_temp_files(temp_file_paths):
- """
- Clean up temporary files
-
- Args:
- temp_file_paths: List of temporary file paths to clean up
- """
- import os
- for temp_path in temp_file_paths:
- try:
- if os.path.exists(temp_path):
- os.unlink(temp_path)
- logger.info(f"Removed temporary file: {temp_path}")
- except Exception as e:
- logger.warning(f"Failed to remove temporary file {temp_path}: {str(e)}")
diff --git a/utils/general_utils.py b/utils/general_utils.py
deleted file mode 100644
index 76aebe89b9cf4a7697d71f6369533579d34f434f..0000000000000000000000000000000000000000
--- a/utils/general_utils.py
+++ /dev/null
@@ -1,198 +0,0 @@
-"""
-General utility functions for historical OCR processing.
-"""
-import os
-import base64
-import hashlib
-import time
-import logging
-from datetime import datetime
-from pathlib import Path
-from functools import wraps
-
-# Configure logging
-logger = logging.getLogger("utils")
-logger.setLevel(logging.INFO)
-
-def generate_cache_key(file_bytes, file_type, use_vision, preprocessing_options=None, pdf_rotation=0, custom_prompt=None):
- """
- Generate a cache key for OCR processing
-
- Args:
- file_bytes: File content as bytes
- file_type: Type of file (pdf or image)
- use_vision: Whether to use vision model
- preprocessing_options: Dictionary of preprocessing options
- pdf_rotation: PDF rotation value
- custom_prompt: Custom prompt for OCR
-
- Returns:
- str: Cache key
- """
- # Generate file hash
- file_hash = hashlib.md5(file_bytes).hexdigest()
-
- # Include preprocessing options in cache key
- preprocessing_options_hash = ""
- if preprocessing_options:
- # Add pdf_rotation to preprocessing options to ensure it's part of the cache key
- if pdf_rotation != 0:
- preprocessing_options_with_rotation = preprocessing_options.copy()
- preprocessing_options_with_rotation['pdf_rotation'] = pdf_rotation
- preprocessing_str = str(sorted(preprocessing_options_with_rotation.items()))
- else:
- preprocessing_str = str(sorted(preprocessing_options.items()))
- preprocessing_options_hash = hashlib.md5(preprocessing_str.encode()).hexdigest()
- elif pdf_rotation != 0:
- # If no preprocessing options but we have rotation, include that in the hash
- preprocessing_options_hash = hashlib.md5(f"pdf_rotation_{pdf_rotation}".encode()).hexdigest()
-
- # Create base cache key
- cache_key = f"{file_hash}_{file_type}_{use_vision}_{preprocessing_options_hash}"
-
- # Include custom prompt in cache key if provided
- if custom_prompt:
- custom_prompt_hash = hashlib.md5(str(custom_prompt).encode()).hexdigest()
- cache_key = f"{cache_key}_{custom_prompt_hash}"
-
- return cache_key
-
-def timing(description):
- """Context manager for timing code execution"""
- class TimingContext:
- def __init__(self, description):
- self.description = description
-
- def __enter__(self):
- self.start_time = time.time()
- return self
-
- def __exit__(self, exc_type, exc_val, exc_tb):
- end_time = time.time()
- execution_time = end_time - self.start_time
- logger.info(f"{self.description} took {execution_time:.2f} seconds")
- return False
-
- return TimingContext(description)
-
-def format_timestamp(timestamp=None, for_filename=False):
- """
- Format timestamp for display or filenames
-
- Args:
- timestamp: Datetime object or string to format (defaults to current time)
- for_filename: Whether to format for use in a filename (defaults to False)
-
- Returns:
- str: Formatted timestamp
- """
- if timestamp is None:
- timestamp = datetime.now()
- elif isinstance(timestamp, str):
- try:
- timestamp = datetime.strptime(timestamp, "%Y-%m-%d %H:%M:%S")
- except ValueError:
- timestamp = datetime.now()
-
- if for_filename:
- # Format suitable for filenames: "Apr 30, 2025"
- return timestamp.strftime("%b %d, %Y")
- else:
- # Standard format for display
- return timestamp.strftime("%Y-%m-%d %H:%M")
-
-def create_descriptive_filename(original_filename, result, file_ext, preprocessing_options=None):
- """
- Create a user-friendly descriptive filename for the result
-
- Args:
- original_filename: Original filename
- result: OCR result dictionary
- file_ext: File extension
- preprocessing_options: Dictionary of preprocessing options
-
- Returns:
- str: Human-readable descriptive filename
- """
- # Get base name without extension and capitalize words
- original_name = Path(original_filename).stem
-
- # Make the original name more readable by replacing dashes and underscores with spaces
- # Then capitalize each word
- readable_name = original_name.replace('-', ' ').replace('_', ' ')
- # Split by spaces and capitalize each word, then rejoin
- name_parts = readable_name.split()
- readable_name = ' '.join(word.capitalize() for word in name_parts)
-
- # Determine document type
- doc_type = None
- if 'detected_document_type' in result and result['detected_document_type']:
- doc_type = result['detected_document_type'].capitalize()
- elif 'topics' in result and result['topics']:
- # Use first topic as document type if not explicitly detected
- doc_type = result['topics'][0]
-
- # Find period/era information
- period_info = None
- if 'topics' in result and result['topics']:
- for tag in result['topics']:
- if "century" in tag.lower() or "pre-" in tag.lower() or "era" in tag.lower():
- period_info = tag
- break
-
- # Format metadata within parentheses if available
- metadata = []
- if doc_type:
- metadata.append(doc_type)
- if period_info:
- metadata.append(period_info)
-
- metadata_str = ""
- if metadata:
- metadata_str = f" ({', '.join(metadata)})"
-
- # Add current date for uniqueness and sorting
- current_date = format_timestamp(for_filename=True)
- date_str = f" - {current_date}"
-
- # Generate final user-friendly filename
- descriptive_name = f"{readable_name}{metadata_str}{date_str}{file_ext}"
- return descriptive_name
-
-def extract_subject_tags(result, raw_text, preprocessing_options=None):
- """
- Extract subject tags from OCR result
-
- Args:
- result: OCR result dictionary
- raw_text: Raw text from OCR
- preprocessing_options: Dictionary of preprocessing options
-
- Returns:
- list: Subject tags
- """
- subject_tags = []
-
- # Use existing topics as starting point if available
- if 'topics' in result and result['topics']:
- subject_tags = list(result['topics'])
-
- # Add document type if detected
- if 'detected_document_type' in result:
- doc_type = result['detected_document_type'].capitalize()
- if doc_type not in subject_tags:
- subject_tags.append(doc_type)
-
- # If no tags were found, add some defaults
- if not subject_tags:
- subject_tags = ["Document", "Historical Document"]
-
- # Try to infer content type
- if "letter" in raw_text.lower()[:1000] or "dear" in raw_text.lower()[:200]:
- subject_tags.append("Letter")
-
- # Check if it might be a newspaper
- if "newspaper" in raw_text.lower()[:1000] or "editor" in raw_text.lower()[:500]:
- subject_tags.append("Newspaper")
-
- return subject_tags
diff --git a/utils/helpers/__init__.py b/utils/helpers/__init__.py
deleted file mode 100644
index 5c3cf3ae0a0a5cda32fe7ec60311defd5e0afd70..0000000000000000000000000000000000000000
--- a/utils/helpers/__init__.py
+++ /dev/null
@@ -1 +0,0 @@
-"""Helper modules for OCR processing utilities."""
\ No newline at end of file
diff --git a/utils/helpers/language_detection.py b/utils/helpers/language_detection.py
deleted file mode 100644
index 7667bb22513eed1d5ad556580edbddc79b1ef53b..0000000000000000000000000000000000000000
--- a/utils/helpers/language_detection.py
+++ /dev/null
@@ -1,373 +0,0 @@
-# Standard library imports
-import logging
-import re
-from typing import List, Dict, Set, Tuple, Optional, Union, Any
-from functools import lru_cache
-
-# Configure logging
-logging.basicConfig(level=logging.INFO,
- format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
-logger = logging.getLogger(__name__)
-
-class LanguageDetector:
- """
- A language detection system that provides balanced detection across multiple languages
- using an enhanced statistical approach.
- """
-
- def __init__(self):
- """Initialize the language detector with statistical language models"""
- logger.info("Initializing language detector with statistical models")
-
- # Initialize language indicators dictionary for statistical detection
- self._init_language_indicators()
- # Set thresholds for language detection confidence
- self.single_lang_confidence = 65 # Minimum score to consider a language detected
- self.secondary_lang_threshold = 0.75 # Secondary language must be at least this fraction of primary score
-
- def _init_language_indicators(self):
- """Initialize language indicators for statistical detection with historical markers"""
- # Define indicators for all supported languages with equal detail level
- # Each language has:
- # - Distinctive characters
- # - Common words (including historical forms)
- # - N-grams (character sequences)
- # - Historical markers specific to older forms of the language
- self.language_indicators = {
- "English": {
- "chars": [], # English uses basic Latin alphabet without special chars
- "words": ['the', 'and', 'of', 'to', 'in', 'a', 'is', 'that', 'for', 'it',
- 'with', 'as', 'be', 'on', 'by', 'at', 'this', 'have', 'from', 'or',
- 'an', 'but', 'not', 'what', 'all', 'were', 'when', 'we', 'there', 'can',
- 'would', 'who', 'you', 'been', 'one', 'their', 'has', 'more', 'if', 'no'],
- "ngrams": ['th', 'he', 'in', 'er', 'an', 're', 'on', 'at', 'en', 'nd', 'ti', 'es', 'or',
- 'ing', 'tion', 'the', 'and', 'tha', 'ent', 'ion'],
- "historical": {
- "chars": ['þ', 'ȝ', 'æ', 'ſ'], # Thorn, yogh, ash, long s
- "words": ['thou', 'thee', 'thy', 'thine', 'hath', 'doth', 'ere', 'whilom', 'betwixt',
- 'ye', 'art', 'wast', 'dost', 'hast', 'shalt', 'mayst', 'verily'],
- "patterns": ['eth$', '^y[^a-z]', 'ck$', 'aught', 'ought'] # -eth endings, y- prefixes
- }
- },
- "French": {
- "chars": ['é', 'è', 'ê', 'à', 'ç', 'ù', 'â', 'î', 'ô', 'û', 'ë', 'ï', 'ü'],
- "words": ['le', 'la', 'les', 'et', 'en', 'de', 'du', 'des', 'un', 'une', 'ce', 'cette',
- 'ces', 'dans', 'par', 'pour', 'sur', 'qui', 'que', 'quoi', 'où', 'quand', 'comment',
- 'est', 'sont', 'ont', 'nous', 'vous', 'ils', 'elles', 'avec', 'sans', 'mais', 'ou'],
- "ngrams": ['es', 'le', 'de', 'en', 'on', 'nt', 'qu', 'ai', 'an', 'ou', 'ur', 're', 'me',
- 'les', 'ent', 'que', 'des', 'ons', 'ant', 'ion'],
- "historical": {
- "chars": ['ſ', 'æ', 'œ'], # Long s and ligatures
- "words": ['aultre', 'avecq', 'icelluy', 'oncques', 'moult', 'estre', 'mesme', 'ceste',
- 'ledict', 'celuy', 'ceulx', 'aulcun', 'ainſi', 'touſiours', 'eſtre',
- 'eſt', 'meſme', 'felon', 'auec', 'iufques', 'chofe', 'fcience'],
- "patterns": ['oi[ts]$', 'oi[re]$', 'f[^aeiou]', 'ff', 'ſ', 'auoit', 'eſtoit',
- 'ſi', 'ſur', 'ſa', 'cy', 'ayant', 'oy', 'uſ', 'auſ']
- },
- },
- "German": {
- "chars": ['ä', 'ö', 'ü', 'ß'],
- "words": ['der', 'die', 'das', 'und', 'in', 'zu', 'den', 'ein', 'eine', 'mit', 'ist', 'von',
- 'des', 'sich', 'auf', 'für', 'als', 'auch', 'werden', 'bei', 'durch', 'aus', 'sind',
- 'nicht', 'nur', 'wurde', 'wie', 'wenn', 'aber', 'noch', 'nach', 'so', 'sein', 'über'],
- "ngrams": ['en', 'er', 'ch', 'de', 'ei', 'in', 'te', 'nd', 'ie', 'ge', 'un', 'sch', 'ich',
- 'den', 'die', 'und', 'der', 'ein', 'ung', 'cht'],
- "historical": {
- "chars": ['ſ', 'ů', 'ė', 'ÿ'],
- "words": ['vnnd', 'vnnd', 'vnter', 'vnd', 'seyn', 'thun', 'auff', 'auß', 'deß', 'diß'],
- "patterns": ['^v[nd]', 'th', 'vnter', 'ſch']
- }
- },
- "Spanish": {
- "chars": ['á', 'é', 'í', 'ó', 'ú', 'ñ', 'ü', '¿', '¡'],
- "words": ['el', 'la', 'los', 'las', 'de', 'en', 'y', 'a', 'que', 'por', 'un', 'una', 'no',
- 'es', 'con', 'para', 'su', 'al', 'se', 'del', 'como', 'más', 'pero', 'lo', 'mi',
- 'si', 'ya', 'todo', 'esta', 'cuando', 'hay', 'muy', 'bien', 'sin', 'así'],
- "ngrams": ['de', 'en', 'os', 'es', 'la', 'ar', 'el', 'er', 'ra', 'as', 'an', 'do', 'or',
- 'que', 'nte', 'los', 'ado', 'con', 'ent', 'ien'],
- "historical": {
- "chars": ['ſ', 'ç', 'ñ'],
- "words": ['facer', 'fijo', 'fermoso', 'agora', 'asaz', 'aver', 'caſa', 'deſde', 'eſte',
- 'eſta', 'eſto', 'deſto', 'deſta', 'eſſo', 'muger', 'dixo', 'fazer'],
- "patterns": ['^f[aei]', 'ſſ', 'ſc', '^deſ', 'xo$', 'xe$']
- },
- },
- "Italian": {
- "chars": ['à', 'è', 'é', 'ì', 'í', 'ò', 'ó', 'ù', 'ú'],
- "words": ['il', 'la', 'i', 'le', 'e', 'di', 'a', 'in', 'che', 'non', 'per', 'con', 'un',
- 'una', 'del', 'della', 'è', 'sono', 'da', 'si', 'come', 'anche', 'più', 'ma', 'ci',
- 'se', 'ha', 'mi', 'lo', 'ti', 'al', 'tu', 'questo', 'questi'],
- "ngrams": ['di', 'la', 'er', 'to', 're', 'co', 'de', 'in', 'ra', 'on', 'li', 'no', 'ri',
- 'che', 'ent', 'con', 'per', 'ion', 'ato', 'lla']
- },
- "Portuguese": {
- "chars": ['á', 'â', 'ã', 'à', 'é', 'ê', 'í', 'ó', 'ô', 'õ', 'ú', 'ç'],
- "words": ['o', 'a', 'os', 'as', 'de', 'em', 'e', 'do', 'da', 'dos', 'das', 'no', 'na',
- 'para', 'que', 'um', 'uma', 'por', 'com', 'se', 'não', 'mais', 'como', 'mas',
- 'você', 'eu', 'este', 'isso', 'ele', 'seu', 'sua', 'ou', 'já', 'me'],
- "ngrams": ['de', 'os', 'em', 'ar', 'es', 'ra', 'do', 'da', 'en', 'co', 'nt', 'ad', 'to',
- 'que', 'nto', 'ent', 'com', 'ção', 'ado', 'ment']
- },
- "Dutch": {
- "chars": ['ë', 'ï', 'ö', 'ü', 'é', 'è', 'ê', 'ç', 'á', 'à', 'ä', 'ó', 'ô', 'ú', 'ù', 'û', 'ij'],
- "words": ['de', 'het', 'een', 'en', 'van', 'in', 'is', 'dat', 'op', 'te', 'zijn', 'met',
- 'voor', 'niet', 'aan', 'er', 'die', 'maar', 'dan', 'ik', 'je', 'hij', 'zij', 'we',
- 'kunnen', 'wordt', 'nog', 'door', 'over', 'als', 'uit', 'bij', 'om', 'ook'],
- "ngrams": ['en', 'de', 'er', 'ee', 'ge', 'an', 'aa', 'in', 'te', 'et', 'ng', 'ee', 'or',
- 'van', 'het', 'een', 'ing', 'ver', 'den', 'sch']
- },
- "Russian": {
- # Russian (Cyrillic alphabet) characters
- "chars": ['а', 'б', 'в', 'г', 'д', 'е', 'ё', 'ж', 'з', 'и', 'й', 'к', 'л', 'м', 'н', 'о', 'п',
- 'р', 'с', 'т', 'у', 'ф', 'х', 'ц', 'ч', 'ш', 'щ', 'ъ', 'ы', 'ь', 'э', 'ю', 'я'],
- "words": ['и', 'в', 'не', 'на', 'что', 'я', 'с', 'а', 'то', 'он', 'как', 'этот', 'по',
- 'но', 'из', 'к', 'у', 'за', 'вы', 'все', 'так', 'же', 'от', 'для', 'о', 'его',
- 'мы', 'было', 'она', 'бы', 'мне', 'еще', 'есть', 'быть', 'был'],
- "ngrams": ['о', 'е', 'а', 'н', 'и', 'т', 'р', 'с', 'в', 'л', 'к', 'м', 'д',
- 'ст', 'но', 'то', 'ни', 'на', 'по', 'ет']
- },
- "Chinese": {
- "chars": ['的', '是', '不', '了', '在', '和', '有', '我', '们', '人', '这', '上', '中',
- '个', '大', '来', '到', '国', '时', '要', '地', '出', '会', '可', '也', '就',
- '年', '生', '对', '能', '自', '那', '都', '得', '说', '过', '子', '家', '后', '多'],
- # Chinese doesn't have "words" in the same way as alphabetic languages
- "words": ['的', '是', '不', '了', '在', '和', '有', '我', '们', '人', '这', '上', '中',
- '个', '大', '来', '到', '国', '时', '要', '地', '出', '会', '可', '也', '就'],
- "ngrams": ['的', '是', '不', '了', '在', '我', '有', '和', '人', '这', '中', '大', '来', '上',
- '国', '个', '到', '说', '们', '为']
- },
- "Japanese": {
- # A mix of hiragana, katakana, and common kanji
- "chars": ['あ', 'い', 'う', 'え', 'お', 'か', 'き', 'く', 'け', 'こ', 'さ', 'し', 'す', 'せ', 'そ',
- 'ア', 'イ', 'ウ', 'エ', 'オ', 'カ', 'キ', 'ク', 'ケ', 'コ', 'サ', 'シ', 'ス', 'セ', 'ソ',
- '日', '本', '人', '大', '小', '中', '山', '川', '田', '子', '女', '男', '月', '火', '水'],
- "words": ['は', 'を', 'に', 'の', 'が', 'で', 'へ', 'から', 'より', 'まで', 'だ', 'です', 'した',
- 'ます', 'ません', 'です', 'これ', 'それ', 'あれ', 'この', 'その', 'あの', 'わたし'],
- "ngrams": ['の', 'は', 'た', 'が', 'を', 'に', 'て', 'で', 'と', 'し', 'か', 'ま', 'こ', 'い',
- 'する', 'いる', 'れる', 'なる', 'れて', 'した']
- },
- "Korean": {
- "chars": ['가', '나', '다', '라', '마', '바', '사', '아', '자', '차', '카', '타', '파', '하',
- '그', '는', '을', '이', '에', '에서', '로', '으로', '와', '과', '또는', '하지만'],
- "words": ['이', '그', '저', '나', '너', '우리', '그들', '이것', '그것', '저것', '은', '는',
- '이', '가', '을', '를', '에', '에서', '으로', '로', '와', '과', '의', '하다', '되다'],
- "ngrams": ['이', '다', '는', '에', '하', '고', '지', '서', '의', '가', '을', '로', '을', '으',
- '니다', '습니', '하는', '이다', '에서', '하고']
- },
- "Arabic": {
- "chars": ['ا', 'ب', 'ت', 'ث', 'ج', 'ح', 'خ', 'د', 'ذ', 'ر', 'ز', 'س', 'ش', 'ص', 'ض',
- 'ط', 'ظ', 'ع', 'غ', 'ف', 'ق', 'ك', 'ل', 'م', 'ن', 'ه', 'و', 'ي', 'ء', 'ة', 'ى'],
- "words": ['في', 'من', 'على', 'إلى', 'هذا', 'هذه', 'ذلك', 'تلك', 'هو', 'هي', 'هم', 'أنا',
- 'أنت', 'نحن', 'كان', 'كانت', 'يكون', 'لا', 'لم', 'ما', 'أن', 'و', 'أو', 'ثم', 'بعد'],
- "ngrams": ['ال', 'ان', 'في', 'من', 'ون', 'ين', 'ات', 'ار', 'ور', 'ما', 'لا', 'ها', 'ان',
- 'الم', 'لان', 'علا', 'الح', 'الس', 'الع', 'الت']
- },
- "Hindi": {
- "chars": ['अ', 'आ', 'इ', 'ई', 'उ', 'ऊ', 'ए', 'ऐ', 'ओ', 'औ', 'क', 'ख', 'ग', 'घ', 'ङ',
- 'च', 'छ', 'ज', 'झ', 'ञ', 'ट', 'ठ', 'ड', 'ढ', 'ण', 'त', 'थ', 'द', 'ध', 'न',
- 'प', 'फ', 'ब', 'भ', 'म', 'य', 'र', 'ल', 'व', 'श', 'ष', 'स', 'ह', 'ा', 'ि', 'ी',
- 'ु', 'ू', 'े', 'ै', 'ो', 'ौ', '्', 'ं', 'ः'],
- "words": ['और', 'का', 'के', 'की', 'एक', 'में', 'है', 'यह', 'हैं', 'से', 'को', 'पर', 'इस',
- 'हो', 'गया', 'कर', 'मैं', 'या', 'हुआ', 'था', 'वह', 'अपने', 'सकता', 'ने', 'बहुत'],
- "ngrams": ['का', 'के', 'की', 'है', 'ने', 'से', 'मे', 'को', 'पर', 'हा', 'रा', 'ता', 'या',
- 'ार', 'ान', 'कार', 'राज', 'ारा', 'जाए', 'ेजा']
- },
- "Latin": {
- "chars": [], # Latin uses basic Latin alphabet
- "words": ['et', 'in', 'ad', 'est', 'sunt', 'non', 'cum', 'sed', 'qui', 'quod', 'ut', 'si',
- 'nec', 'ex', 'per', 'quam', 'pro', 'iam', 'hoc', 'aut', 'esse', 'enim', 'de',
- 'atque', 'ac', 'ante', 'post', 'sub', 'ab'],
- "ngrams": ['us', 'is', 'um', 'er', 'it', 'nt', 'am', 'em', 're', 'at', 'ti', 'es', 'ur',
- 'tur', 'que', 'ere', 'ent', 'ius', 'rum', 'tus']
- },
- "Greek": {
- "chars": ['α', 'β', 'γ', 'δ', 'ε', 'ζ', 'η', 'θ', 'ι', 'κ', 'λ', 'μ', 'ν', 'ξ', 'ο', 'π',
- 'ρ', 'σ', 'ς', 'τ', 'υ', 'φ', 'χ', 'ψ', 'ω', 'ά', 'έ', 'ή', 'ί', 'ό', 'ύ', 'ώ'],
- "words": ['και', 'του', 'της', 'των', 'στο', 'στη', 'με', 'από', 'για', 'είναι', 'να',
- 'ότι', 'δεν', 'στον', 'μια', 'που', 'ένα', 'έχει', 'θα', 'το', 'ο', 'η', 'τον'],
- "ngrams": ['αι', 'τα', 'ου', 'τη', 'οι', 'το', 'ης', 'αν', 'ος', 'ον', 'ις', 'ει', 'ερ',
- 'και', 'την', 'τον', 'ους', 'νου', 'εντ', 'μεν']
- }
- }
-
- def detect_languages(self, text: str, filename: str = None, current_languages: List[str] = None) -> List[str]:
- """
- Detect languages in text using an enhanced statistical approach
-
- Args:
- text: Text to analyze
- filename: Optional filename to provide additional context
- current_languages: Optional list of languages already detected
-
- Returns:
- List of detected languages
- """
- logger = logging.getLogger("language_detector")
-
- # If no text provided, return current languages or default
- if not text or len(text.strip()) < 10:
- return current_languages if current_languages else ["English"]
-
- # If we already have detected languages, use them
- if current_languages and len(current_languages) > 0:
- logger.info(f"Using already detected languages: {current_languages}")
- return current_languages
-
- # Use enhanced statistical detection
- detected_languages = self._detect_statistically(text, filename)
- logger.info(f"Statistical language detection results: {detected_languages}")
- return detected_languages
-
- def _detect_statistically(self, text: str, filename: str = None) -> List[str]:
- """
- Detect languages using enhanced statistical analysis with historical language indicators
-
- Args:
- text: Text to analyze
- filename: Optional filename for additional context
-
- Returns:
- List of detected languages
- """
- logger = logging.getLogger("language_detector")
-
- # Normalize text to lowercase for consistent analysis
- text_lower = text.lower()
- words = re.findall(r'\b\w+\b', text_lower) # Extract words
-
- # Score each language based on characters, words, n-grams, and historical markers
- language_scores = {}
- historical_bonus = {}
-
- # PHASE 1: Special character analysis
- # Count special characters for each language
- special_char_counts = {}
- total_special_chars = 0
-
- for language, indicators in self.language_indicators.items():
- chars = indicators["chars"]
- count = 0
- for char in chars:
- if char in text_lower:
- count += text_lower.count(char)
- special_char_counts[language] = count
- total_special_chars += count
-
- # Normalize character scores (0-30 points)
- for language, count in special_char_counts.items():
- if total_special_chars > 0:
- # Scale score to 0-30 range (reduced from 35 to make room for historical)
- normalized_score = (count / total_special_chars) * 30
- language_scores[language] = normalized_score
- else:
- language_scores[language] = 0
-
- # PHASE 2: Word analysis (0-30 points)
- # Count common words for each language
- for language, indicators in self.language_indicators.items():
- word_list = indicators["words"]
- word_matches = sum(1 for word in words if word in word_list)
-
- # Normalize word score based on text length and word list size
- word_score_factor = min(1.0, word_matches / (len(words) * 0.1)) # Max 1.0 if 10% match
- language_scores[language] = language_scores.get(language, 0) + (word_score_factor * 30)
-
- # PHASE 3: N-gram analysis (0-20 points)
- for language, indicators in self.language_indicators.items():
- ngram_list = indicators["ngrams"]
- ngram_matches = 0
-
- # Count ngram occurrences
- for ngram in ngram_list:
- ngram_matches += text_lower.count(ngram)
-
- # Normalize ngram score based on text length
- if len(text_lower) > 0:
- ngram_score_factor = min(1.0, ngram_matches / (len(text_lower) * 0.05)) # Max 1.0 if 5% match
- language_scores[language] = language_scores.get(language, 0) + (ngram_score_factor * 20)
-
- # PHASE 4: Historical language markers (0-20 points)
- for language, indicators in self.language_indicators.items():
- if "historical" in indicators:
- historical_indicators = indicators["historical"]
- historical_score = 0
-
- # Check for historical chars
- if "chars" in historical_indicators:
- for char in historical_indicators["chars"]:
- if char in text_lower:
- historical_score += text_lower.count(char) * 0.5
-
- # Check for historical words
- if "words" in historical_indicators:
- hist_words = historical_indicators["words"]
- hist_word_matches = sum(1 for word in words if word in hist_words)
- if hist_word_matches > 0:
- # Historical words are strong indicators
- historical_score += min(10, hist_word_matches * 2)
-
- # Check for historical patterns
- if "patterns" in historical_indicators:
- for pattern in historical_indicators["patterns"]:
- matches = len(re.findall(pattern, text_lower))
- if matches > 0:
- historical_score += min(5, matches * 0.5)
-
- # Cap historical score at 20 points
- historical_score = min(20, historical_score)
- historical_bonus[language] = historical_score
-
- # Apply historical bonus
- language_scores[language] += historical_score
-
- # Apply language-specific exclusivity multiplier if present
- if "exclusivity" in indicators:
- exclusivity = indicators["exclusivity"]
- language_scores[language] *= exclusivity
- logger.info(f"Applied exclusivity multiplier {exclusivity} to {language}")
-
- # Print historical bonus for debugging
- for language, bonus in historical_bonus.items():
- if bonus > 0:
- logger.info(f"Historical language bonus for {language}: {bonus} points")
-
- # Final language selection with more stringent criteria
- # Get languages with scores above threshold
- threshold = self.single_lang_confidence # Higher minimum score
- candidates = [(lang, score) for lang, score in language_scores.items() if score >= threshold]
- candidates.sort(key=lambda x: x[1], reverse=True)
-
- logger.info(f"Language candidates: {candidates}")
-
- # If we have candidate languages, return top 1-2 with higher threshold for secondary
- if candidates:
- # Always take top language
- result = [candidates[0][0]]
-
- # Add second language only if it's significantly strong compared to primary
- # and doesn't have a historical/exclusivity conflict
- if len(candidates) > 1:
- primary_lang = candidates[0][0]
- secondary_lang = candidates[1][0]
- primary_score = candidates[0][1]
- secondary_score = candidates[1][1]
-
- # Only add secondary if it meets threshold and doesn't conflict
- ratio = secondary_score / primary_score
-
- # Check for French and Spanish conflict (historical French often gets misidentified)
- historical_conflict = False
- if (primary_lang == "French" and secondary_lang == "Spanish" and
- historical_bonus.get("French", 0) > 5):
- historical_conflict = True
- logger.info("Historical French markers detected, suppressing Spanish detection")
-
- if ratio >= self.secondary_lang_threshold and not historical_conflict:
- result.append(secondary_lang)
- logger.info(f"Added secondary language {secondary_lang} (score ratio: {ratio:.2f})")
- else:
- logger.info(f"Rejected secondary language {secondary_lang} (score ratio: {ratio:.2f})")
-
- return result
-
- # Default to English if no clear signals
diff --git a/utils/helpers/letterhead_handler.py b/utils/helpers/letterhead_handler.py
deleted file mode 100644
index 8d3f539298a5a51b2d3d169bb46dbd1f38ef4c04..0000000000000000000000000000000000000000
--- a/utils/helpers/letterhead_handler.py
+++ /dev/null
@@ -1,82 +0,0 @@
-# Standard library imports
-import os
-import logging
-from pathlib import Path
-
-# Configure logging
-logging.basicConfig(level=logging.INFO,
- format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
-logger = logging.getLogger(__name__)
-
-def is_likely_letterhead(file_path, features=None):
- """
- Determine if a document is likely to contain letterhead or marginalia
-
- Args:
- file_path: Path to the document image
- features: Optional dictionary of pre-extracted features like text density
-
- Returns:
- bool: True if the document likely contains letterhead, False otherwise
- """
- # Simple logic based on filename for initial version
- file_name = Path(file_path).name.lower()
- letterhead_indicators = ['letter', 'letterhead', 'correspondence', 'memo']
-
- # Check filename for indicators
- for indicator in letterhead_indicators:
- if indicator in file_name:
- logger.info(f"Letterhead detected based on filename: {file_name}")
- return True
-
- # Check features if provided
- if features:
- # High text density at the top of the document may indicate letterhead
- if 'top_density' in features and features['top_density'] > 0.5:
- logger.info(f"Letterhead detected based on top text density: {features['top_density']}")
- return True
-
- # Uneven text distribution may indicate marginalia
- if 'density_variance' in features and features['density_variance'] > 0.3:
- logger.info(f"Possible marginalia detected based on text density variance")
- return True
-
- # Default to standard document
- return False
-
-def get_letterhead_prompt(file_path, features=None):
- """
- Generate a specialized prompt for letterhead document OCR
-
- Args:
- file_path: Path to the document image
- features: Optional dictionary of pre-extracted features
-
- Returns:
- str: Specialized prompt for letterhead document OCR
- """
- # Base prompt for all letterhead documents
- base_prompt = ("This document appears to be a letter or includes letterhead elements. "
- "Please extract the following components separately if present:\n"
- "1. Letterhead (header with logo, organization name, address, etc.)\n"
- "2. Date\n"
- "3. Recipient information (address, name, title)\n"
- "4. Salutation (e.g., 'Dear Sir/Madam')\n"
- "5. Main body text\n"
- "6. Closing (e.g., 'Sincerely')\n"
- "7. Signature\n"
- "8. Any footnotes, marginalia, or annotations\n\n"
- "Preserve the original formatting and structure as much as possible.")
-
- # Enhanced prompts based on features
- if features:
- # Extract additional context from features if available
- if 'is_historical' in features and features['is_historical']:
- base_prompt += ("\n\nThis appears to be a historical document. Pay special attention to older "
- "letterhead styles, formal language patterns, and period-specific formatting.")
-
- if 'has_marginalia' in features and features['has_marginalia']:
- base_prompt += ("\n\nThe document contains marginalia or handwritten notes in the margins. "
- "Please extract these separately from the main text and indicate their position.")
-
- return base_prompt
diff --git a/utils/helpers/ocr_text_repair.py b/utils/helpers/ocr_text_repair.py
deleted file mode 100644
index 488e7ee577ef20a531ae6a83425e357bba2b646f..0000000000000000000000000000000000000000
--- a/utils/helpers/ocr_text_repair.py
+++ /dev/null
@@ -1,270 +0,0 @@
-# Standard library imports
-import re
-import logging
-from difflib import SequenceMatcher
-from typing import Tuple, Dict, Any, List, Optional
-
-# Configure logging
-logging.basicConfig(level=logging.INFO,
- format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
-logger = logging.getLogger(__name__)
-
-def detect_duplicate_text_issues(text: str) -> Tuple[bool, Dict[str, Any]]:
- """
- Detect if OCR text has duplication issues often found in handwritten document OCR
-
- Args:
- text: OCR text to analyze
-
- Returns:
- Tuple of (has_duplication_issues, details_dict)
- """
- # Early exit for empty text
- if not text or len(text) < 100:
- return False, {"duplication_rate": 0.0, "details": "Text too short for analysis"}
-
- # Look for repeated line patterns
- lines = text.split('\n')
- line_count = len(lines)
-
- # Basic metrics
- repeated_lines = 0
- duplicate_sections = []
- line_repetition_indices = []
-
- # Check for exact line repetitions
- seen_lines = {}
- for i, line in enumerate(lines):
- # Skip very short lines or empty lines
- stripped = line.strip()
- if len(stripped) < 5:
- continue
-
- if stripped in seen_lines:
- repeated_lines += 1
- line_repetition_indices.append((seen_lines[stripped], i))
- else:
- seen_lines[stripped] = i
-
- # Calculate line repetition rate
- line_repetition_rate = repeated_lines / max(1, line_count)
-
- # Look for longer repeated sections using sequence matcher
- text_blocks = [text[i:i+100] for i in range(0, len(text), 100) if i+100 <= len(text)]
- block_count = len(text_blocks)
-
- repeated_blocks = 0
- for i in range(block_count):
- for j in range(i+1, min(i+10, block_count)): # Only check nearby blocks for efficiency
- matcher = SequenceMatcher(None, text_blocks[i], text_blocks[j])
- similarity = matcher.ratio()
- if similarity > 0.8: # High similarity threshold
- repeated_blocks += 1
- duplicate_sections.append((i, j, similarity))
- break
-
- # Calculate block repetition rate
- block_repetition_rate = repeated_blocks / max(1, block_count)
-
- # Combine metrics for overall duplication rate
- duplication_rate = max(line_repetition_rate, block_repetition_rate)
-
- # Detect patterns of repeated words in sequence (common OCR mistake)
- word_pattern = r'\b(\w+)\s+\1\b'
- repeated_words = len(re.findall(word_pattern, text))
- repeated_words_rate = repeated_words / max(1, len(text.split()))
-
- # Update duplication rate with word repetition
- duplication_rate = max(duplication_rate, repeated_words_rate)
-
- # Log detailed analysis
- logger.info(f"OCR duplication analysis: line_repetition={line_repetition_rate:.2f}, "
- f"block_repetition={block_repetition_rate:.2f}, "
- f"word_repetition={repeated_words_rate:.2f}, "
- f"final_rate={duplication_rate:.2f}")
-
- # Determine if this is a serious issue
- has_duplication = duplication_rate > 0.1
-
- # Return detailed results
- return has_duplication, {
- "duplication_rate": duplication_rate,
- "line_repetition_rate": line_repetition_rate,
- "block_repetition_rate": block_repetition_rate,
- "word_repetition_rate": repeated_words_rate,
- "repeated_lines": repeated_lines,
- "repeated_blocks": repeated_blocks,
- "repeated_words": repeated_words,
- "duplicate_sections": duplicate_sections[:10], # Only include the first 10 for brevity
- "repetition_indices": line_repetition_indices[:10]
- }
-
-def get_enhanced_preprocessing_options(current_options: Optional[Dict[str, Any]] = None) -> Dict[str, Any]:
- """
- Generate enhanced preprocessing options for improved OCR on handwritten documents
-
- Args:
- current_options: Current preprocessing options (if available)
-
- Returns:
- Dict of enhanced options
- """
- # Start with current options or empty dict
- options = current_options.copy() if current_options else {}
-
- # Set document type to handwritten
- options["document_type"] = "handwritten"
-
- # Enhanced contrast - higher than normal for better handwriting extraction
- options["contrast"] = 1.4 # Higher than default
-
- # Apply grayscale
- options["grayscale"] = True
-
- # Apply adaptive thresholding optimized for handwriting
- options["adaptive_threshold"] = True
- options["threshold_block_size"] = 25 # Larger block size for handwriting
- options["threshold_c"] = 10 # Adjusted C value for better handwriting detection
-
- # Disable standard binarization which often loses handwriting detail
- options["binarize"] = False
-
- # Despeckle to reduce noise
- options["denoise"] = True
-
- # Enable handwriting-specific preprocessing
- options["handwriting_mode"] = True
-
- # Disable anything that might harm handwriting recognition
- if "sharpen" in options:
- options["sharpen"] = False
-
- logger.info(f"Enhanced handwriting preprocessing options generated: {options}")
- return options
-
-def get_handwritten_specific_prompt(current_prompt: Optional[str] = None) -> str:
- """
- Generate a specialized prompt for handwritten document OCR
-
- Args:
- current_prompt: Current prompt (if available)
-
- Returns:
- str: Enhanced prompt for handwritten documents
- """
- # Base prompt for all handwritten documents
- base_prompt = ("This is a handwritten document that requires careful transcription. "
- "Please transcribe all visible handwritten text, preserving the original "
- "line breaks, paragraph structure, and any special formatting or indentation. "
- "Pay special attention to:\n"
- "1. Words that may be difficult to read due to handwriting style\n"
- "2. Any crossed-out text (indicate with [crossed out: possible text])\n"
- "3. Insertions or annotations between lines or in margins\n"
- "4. Maintain the spatial layout of the text as much as possible\n"
- "5. If there are multiple columns or non-linear text, preserve the reading order\n\n"
- "If you cannot read a word with confidence, indicate with [?] or provide your best guess as [word?].")
-
- # If there's an existing prompt, combine them, otherwise just use the base
- if current_prompt:
- # Remove any redundant instructions about handwriting
- lower_prompt = current_prompt.lower()
- if "handwritten" in lower_prompt or "handwriting" in lower_prompt:
- # Extract any unique instructions from the current prompt
- # This logic is simplified and might need improvement
- current_sentences = [s.strip() for s in current_prompt.split('.') if s.strip()]
- handwriting_sentences = [s for s in current_sentences
- if "handwritten" not in s.lower()
- and "handwriting" not in s.lower()]
-
- # Add unique instructions to our base prompt
- if handwriting_sentences:
- combined_prompt = base_prompt + "\n\nAdditional instructions:\n"
- combined_prompt += ". ".join(handwriting_sentences) + "."
- return combined_prompt
- else:
- # If no handwriting instructions in the current prompt, just append it
- return f"{base_prompt}\n\nAdditional context from user:\n{current_prompt}"
-
- return base_prompt
-
-def clean_duplicated_text(text: str) -> str:
- """
- Clean up duplicated text often found in OCR output for handwritten documents
-
- Args:
- text: OCR text to clean
-
- Returns:
- str: Cleaned text with duplications removed
- """
- if not text:
- return text
-
- # Split into lines for line-based deduplication
- lines = text.split('\n')
-
- # Remove consecutive duplicate lines
- deduped_lines = []
- prev_line = None
-
- for line in lines:
- stripped = line.strip()
- # Skip empty lines
- if not stripped:
- if not deduped_lines or deduped_lines[-1].strip():
- deduped_lines.append(line) # Keep the first empty line
- continue
-
- # Skip if this line is a duplicate of the previous line
- if stripped == prev_line:
- continue
-
- deduped_lines.append(line)
- prev_line = stripped
-
- # Re-join the deduplicated lines
- deduped_text = '\n'.join(deduped_lines)
-
- # Remove repeated words
- word_pattern = r'\b(\w+)\s+\1\b'
- deduped_text = re.sub(word_pattern, r'\1', deduped_text)
-
- # Remove repeated phrases (3+ words)
- # This is a simplified approach and might need improvement
- words = deduped_text.split()
- cleaned_words = []
- i = 0
-
- while i < len(words):
- # Check for phrase repetition (phrases of 3 to 6 words)
- found_repeat = False
-
- for phrase_len in range(3, min(7, len(words) - i)):
- phrase = ' '.join(words[i:i+phrase_len])
- next_pos = i + phrase_len
-
- if next_pos + phrase_len <= len(words):
- next_phrase = ' '.join(words[next_pos:next_pos+phrase_len])
-
- if phrase.lower() == next_phrase.lower():
- # Found a repeated phrase, skip the second occurrence
- cleaned_words.extend(words[i:i+phrase_len])
- i = next_pos + phrase_len
- found_repeat = True
- break
-
- if not found_repeat:
- cleaned_words.append(words[i])
- i += 1
-
- # Rejoin the cleaned words
- final_text = ' '.join(cleaned_words)
-
- # Log the cleaning results
- original_len = len(text)
- cleaned_len = len(final_text)
- reduction = 100 * (original_len - cleaned_len) / max(1, original_len)
-
- logger.info(f"Text cleaning: removed {original_len - cleaned_len} chars ({reduction:.1f}% reduction)")
-
- return final_text
diff --git a/utils/image_utils.py b/utils/image_utils.py
deleted file mode 100644
index 9147961442195e4cd1b940560231c2ff7126947e..0000000000000000000000000000000000000000
--- a/utils/image_utils.py
+++ /dev/null
@@ -1,1266 +0,0 @@
-"""
-Utility functions for OCR image processing with Mistral AI.
-Contains helper functions for working with OCR responses and image handling.
-"""
-
-# Standard library imports
-import json
-import base64
-import io
-import zipfile
-import logging
-import re
-import time
-import math
-from datetime import datetime
-from pathlib import Path
-from typing import Dict, List, Optional, Union, Any, Tuple
-from functools import lru_cache
-
-# Configure logging
-logging.basicConfig(level=logging.INFO,
- format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
-logger = logging.getLogger(__name__)
-
-# Third-party imports
-import numpy as np
-
-# Mistral AI imports
-from mistralai import DocumentURLChunk, ImageURLChunk, TextChunk
-from mistralai.models import OCRImageObject
-
-# Check for image processing libraries
-try:
- from PIL import Image, ImageEnhance, ImageFilter, ImageOps
- PILLOW_AVAILABLE = True
-except ImportError:
- logger.warning("PIL not available - image preprocessing will be limited")
- PILLOW_AVAILABLE = False
-
-try:
- import cv2
- CV2_AVAILABLE = True
-except ImportError:
- logger.warning("OpenCV (cv2) not available - advanced image processing will be limited")
- CV2_AVAILABLE = False
-
-# Import configuration
-try:
- from config import IMAGE_PREPROCESSING
-except ImportError:
- # Fallback defaults if config not available
- IMAGE_PREPROCESSING = {
- "enhance_contrast": 1.5,
- "sharpen": True,
- "denoise": True,
- "max_size_mb": 8.0,
- "target_dpi": 300,
- "compression_quality": 92
- }
-
-def detect_skew(image: Union[Image.Image, np.ndarray]) -> float:
- """
- Quick skew detection that returns angle in degrees.
- Uses a computationally efficient approach by analyzing at 1% resolution.
-
- Args:
- image: PIL Image or numpy array
-
- Returns:
- Estimated skew angle in degrees (positive or negative)
- """
- # Convert PIL Image to numpy array if needed
- if isinstance(image, Image.Image):
- # Convert to grayscale for processing
- if image.mode != 'L':
- img_np = np.array(image.convert('L'))
- else:
- img_np = np.array(image)
- else:
- # If already numpy array, ensure it's grayscale
- if len(image.shape) == 3:
- if CV2_AVAILABLE:
- img_np = cv2.cvtColor(image, cv2.COLOR_RGB2GRAY)
- else:
- # Fallback grayscale conversion
- img_np = np.mean(image, axis=2).astype(np.uint8)
- else:
- img_np = image
-
- # Downsample to 1% resolution for faster processing
- height, width = img_np.shape
- target_size = int(min(width, height) * 0.01)
-
- # Use a sane minimum size and ensure we have enough pixels to detect lines
- target_size = max(target_size, 100)
-
- if CV2_AVAILABLE:
- # OpenCV-based implementation (faster)
- # Resize the image to the target size
- scale_factor = target_size / max(width, height)
- small_img = cv2.resize(img_np, None, fx=scale_factor, fy=scale_factor, interpolation=cv2.INTER_AREA)
-
- # Apply binary thresholding to get cleaner edges
- _, binary = cv2.threshold(small_img, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)
-
- # Use Hough Line Transform to detect lines
- lines = cv2.HoughLinesP(binary, 1, np.pi/180, threshold=target_size//10,
- minLineLength=target_size//5, maxLineGap=target_size//10)
-
- if lines is None or len(lines) < 3:
- # Not enough lines detected, assume no significant skew
- return 0.0
-
- # Calculate angles of lines
- angles = []
- for line in lines:
- x1, y1, x2, y2 = line[0]
- if x2 - x1 == 0: # Avoid division by zero
- continue
- angle = math.atan2(y2 - y1, x2 - x1) * 180.0 / np.pi
-
- # Normalize angle to -45 to 45 range
- angle = angle % 180
- if angle > 90:
- angle -= 180
- if angle > 45:
- angle -= 90
- if angle < -45:
- angle += 90
-
- angles.append(angle)
-
- if not angles:
- return 0.0
-
- # Use median to reduce impact of outliers
- angles.sort()
- median_angle = angles[len(angles) // 2]
-
- return median_angle
- else:
- # PIL-only fallback implementation
- # Resize using PIL
- small_img = Image.fromarray(img_np).resize(
- (int(width * target_size / max(width, height)),
- int(height * target_size / max(width, height))),
- Image.NEAREST
- )
-
- # Find edges
- edges = small_img.filter(ImageFilter.FIND_EDGES)
- edges_data = np.array(edges)
-
- # Simple edge orientation analysis (less precise than OpenCV)
- # Count horizontal vs vertical edges
- h_edges = np.sum(np.abs(np.diff(edges_data, axis=1)))
- v_edges = np.sum(np.abs(np.diff(edges_data, axis=0)))
-
- # If horizontal edges dominate, no significant skew
- if h_edges > v_edges * 1.2:
- return 0.0
-
- # Simple angle estimation based on edge distribution
- # This is a simplified approach that works for slight skews
- rows, cols = edges_data.shape
- xs, ys = [], []
-
- # Sample strong edge points
- for r in range(0, rows, 2):
- for c in range(0, cols, 2):
- if edges_data[r, c] > 128:
- xs.append(c)
- ys.append(r)
-
- if len(xs) < 10: # Not enough edge points
- return 0.0
-
- # Use simple linear regression to estimate the slope
- n = len(xs)
- mean_x = sum(xs) / n
- mean_y = sum(ys) / n
-
- # Calculate slope
- numerator = sum((xs[i] - mean_x) * (ys[i] - mean_y) for i in range(n))
- denominator = sum((xs[i] - mean_x) ** 2 for i in range(n))
-
- if abs(denominator) < 1e-6: # Avoid division by zero
- return 0.0
-
- slope = numerator / denominator
- angle = math.atan(slope) * 180.0 / math.pi
-
- # Normalize to -45 to 45 degrees
- if angle > 45:
- angle -= 90
- elif angle < -45:
- angle += 90
-
- return angle
-
-def replace_images_in_markdown(md: str, images: dict[str, str]) -> str:
- """
- Replace image placeholders in markdown with base64-encoded images.
- Uses regex-based matching to handle variations in image IDs and formats.
-
- Args:
- md: Markdown text containing image placeholders
- images: Dictionary mapping image IDs to base64 strings
-
- Returns:
- Markdown text with images replaced by base64 data
- """
- # Process each image ID in the dictionary
- for img_id, base64_str in images.items():
- # Extract the base ID without extension for more flexible matching
- base_id = img_id.split('.')[0]
-
- # Match markdown image pattern where URL contains the base ID
- # Using a single regex with groups to capture the full pattern
- pattern = re.compile(rf'!\[([^\]]*)\]\(([^\)]*{base_id}[^\)]*)\)')
-
- # Process all matches
- matches = list(pattern.finditer(md))
- for match in reversed(matches): # Process in reverse to avoid offset issues
- # Replace the entire match with a properly formatted base64 image
- md = md[:match.start()] + f"" + md[match.end():]
-
- return md
-
-def get_combined_markdown(ocr_response) -> str:
- """
- Combine OCR text and images into a single markdown document.
-
- Args:
- ocr_response: OCR response object from Mistral AI
-
- Returns:
- Combined markdown string with embedded images
- """
- markdowns = []
-
- # Process each page of the OCR response
- for page in ocr_response.pages:
- # Extract image data if available
- image_data = {}
- if hasattr(page, "images"):
- for img in page.images:
- if hasattr(img, "id") and hasattr(img, "image_base64"):
- image_data[img.id] = img.image_base64
-
- # Replace image placeholders with base64 data
- page_markdown = page.markdown if hasattr(page, "markdown") else ""
- processed_markdown = replace_images_in_markdown(page_markdown, image_data)
- markdowns.append(processed_markdown)
-
- # Join all pages' markdown with double newlines
- return "\n\n".join(markdowns)
-
-def encode_image_for_api(image_path: Union[str, Path]) -> str:
- """
- Encode an image as base64 data URL for API submission.
-
- Args:
- image_path: Path to the image file
-
- Returns:
- Base64 data URL for the image
- """
- # Convert to Path object if string
- image_file = Path(image_path) if isinstance(image_path, str) else image_path
-
- # Verify image exists
- if not image_file.is_file():
- raise FileNotFoundError(f"Image file not found: {image_file}")
-
- # Determine mime type based on file extension
- mime_type = 'image/jpeg' # Default mime type
- suffix = image_file.suffix.lower()
- if suffix == '.png':
- mime_type = 'image/png'
- elif suffix == '.gif':
- mime_type = 'image/gif'
- elif suffix in ['.jpg', '.jpeg']:
- mime_type = 'image/jpeg'
- elif suffix == '.pdf':
- mime_type = 'application/pdf'
-
- # Encode image as base64
- encoded = base64.b64encode(image_file.read_bytes()).decode()
- return f"data:{mime_type};base64,{encoded}"
-
-def encode_bytes_for_api(file_bytes: bytes, mime_type: str) -> str:
- """
- Encode binary data as base64 data URL for API submission.
-
- Args:
- file_bytes: Binary file data
- mime_type: MIME type of the file (e.g., 'image/jpeg', 'application/pdf')
-
- Returns:
- Base64 data URL for the data
- """
- # Encode data as base64
- encoded = base64.b64encode(file_bytes).decode()
- return f"data:{mime_type};base64,{encoded}"
-
-def calculate_image_entropy(pil_img: Image.Image) -> float:
- """
- Calculate the entropy of a PIL image.
- Entropy is a measure of randomness; low entropy indicates a blank or simple image,
- high entropy indicates more complex content (e.g., text or detailed images).
-
- Args:
- pil_img: PIL Image object
-
- Returns:
- float: Entropy value
- """
- # Convert to grayscale for entropy calculation
- gray_img = pil_img.convert("L")
- arr = np.array(gray_img)
- # Compute histogram
- hist, _ = np.histogram(arr, bins=256, range=(0, 255), density=True)
- # Remove zero entries to avoid log(0)
- hist = hist[hist > 0]
- # Calculate entropy
- entropy = -np.sum(hist * np.log2(hist))
- return float(entropy)
-
-def estimate_text_density(image_np):
- """
- Estimate text density patterns in an image.
- Returns metrics on text distribution and special cases.
-
- Args:
- image_np: Numpy array of the image
-
- Returns:
- dict: Text density metrics
- """
- # Convert to grayscale
- if len(image_np.shape) > 2 and image_np.shape[2] == 3:
- gray = cv2.cvtColor(image_np, cv2.COLOR_RGB2GRAY)
- else:
- gray = image_np
-
- # Binarize image
- _, binary = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)
-
- # Analyze vertical text density profile (important for headers/footers)
- height, width = gray.shape
- vertical_profile = np.sum(binary, axis=1) / width
-
- # Analyze horizontal text density profile
- horizontal_profile = np.sum(binary, axis=0) / height
-
- # Calculate statistics
- v_mean = np.mean(vertical_profile)
- v_std = np.std(vertical_profile)
- v_max = np.max(vertical_profile)
-
- # Detect uppercase text regions (common in headers of Baldwin document)
- # Uppercase text tends to have more consistent height and uniform vertical density
- section_height = height // 10 # Divide into 10 vertical sections
- uppercase_sections = 0
-
- for i in range(0, height, section_height):
- section = binary[i:min(i+section_height, height), :]
- section_profile = np.sum(section, axis=1) / width
-
- # Uppercase characteristics: high density with low variation
- if np.mean(section_profile) > v_mean * 1.5 and np.std(section_profile) < v_std * 0.7:
- uppercase_sections += 1
-
- # Determine overall pattern
- if v_std / v_mean > 0.8:
- pattern = 'varied' # High variance indicates sections with different text densities
- else:
- pattern = 'uniform' # Low variance indicates uniform text distribution
-
- return {
- 'mean_density': float(v_mean),
- 'density_variation': float(v_std),
- 'pattern': pattern,
- 'uppercase_sections': uppercase_sections,
- 'max_density': float(v_max)
- }
-
-def serialize_ocr_object(obj):
- """
- Serialize OCR response objects to JSON serializable format.
- Handles OCRImageObject specifically to prevent serialization errors.
-
- Args:
- obj: The object to serialize
-
- Returns:
- JSON serializable representation of the object
- """
- # Fast path: Handle primitive types directly
- if obj is None or isinstance(obj, (str, int, float, bool)):
- return obj
-
- # Handle collections
- if isinstance(obj, list):
- return [serialize_ocr_object(item) for item in obj]
- elif isinstance(obj, dict):
- return {k: serialize_ocr_object(v) for k, v in obj.items()}
- elif isinstance(obj, OCRImageObject):
- # Special handling for OCRImageObject
- return {
- 'id': obj.id if hasattr(obj, 'id') else None,
- 'image_base64': obj.image_base64 if hasattr(obj, 'image_base64') else None
- }
- elif hasattr(obj, '__dict__'):
- # For objects with __dict__ attribute
- return {k: serialize_ocr_object(v) for k, v in obj.__dict__.items()
- if not k.startswith('_')} # Skip private attributes
- else:
- # Try to convert to string as last resort
- try:
- return str(obj)
- except:
- return None
-
-# Clean OCR result with focus on Mistral compatibility
-def clean_ocr_result(result, use_segmentation=False, vision_enabled=True, preprocessing_options=None):
- """
- Clean text content in OCR results, preserving original structure from Mistral API.
- Only removes markdown/HTML conflicts without duplicating content across fields.
-
- Args:
- result: OCR result object or dictionary
- use_segmentation: Whether image segmentation was used
- vision_enabled: Whether vision model was used
- preprocessing_options: Dictionary of preprocessing options
-
- Returns:
- Cleaned result object
- """
- if not result:
- return result
-
- # Import text utilities for cleaning
- try:
- from utils.text_utils import clean_raw_text
- text_cleaner_available = True
- except ImportError:
- text_cleaner_available = False
-
- def clean_text(text):
- """Clean text content, removing markdown image references and base64 data"""
- if not text or not isinstance(text, str):
- return ""
-
- if text_cleaner_available:
- text = clean_raw_text(text)
- else:
- # Remove image references like 
- text = re.sub(r'!\[.*?\]\(data:image/[^)]+\)', '', text)
-
- # Remove basic markdown image references like 
- text = re.sub(r'!\[[^\]]*\]\([^)]+\)', '', text)
-
- # Remove base64 encoded image data
- text = re.sub(r'data:image/[^;]+;base64,[a-zA-Z0-9+/=]+', '', text)
-
- # Clean up any JSON-like image object references
- text = re.sub(r'{"image(_data)?":("[^"]*"|null|true|false|\{[^}]*\}|\[[^\]]*\])}', '', text)
-
- # Clean up excessive whitespace and line breaks created by removals
- text = re.sub(r'\n{3,}', '\n\n', text)
- text = re.sub(r'\s{3,}', ' ', text)
-
- return text.strip()
-
- # Process dictionary
- if isinstance(result, dict):
- # For PDF documents, preserve original structure from Mistral API
- is_pdf = result.get('file_type', '') == 'pdf' or (
- result.get('file_name', '').lower().endswith('.pdf')
- )
-
- # Ensure ocr_contents exists
- if 'ocr_contents' not in result:
- result['ocr_contents'] = {}
-
- # Clean raw_text if it exists but don't duplicate it
- if 'raw_text' in result:
- result['raw_text'] = clean_text(result['raw_text'])
-
- # Handle ocr_contents fields - clean them but don't duplicate
- if 'ocr_contents' in result:
- for key, value in list(result['ocr_contents'].items()):
- # Skip binary fields and image data
- if key in ['image_base64', 'images', 'binary_data'] and value:
- continue
-
- # Clean string values to remove markdown/HTML conflicts
- if isinstance(value, str):
- result['ocr_contents'][key] = clean_text(value)
-
- # Handle segmentation data
- if use_segmentation and preprocessing_options and 'segmentation_data' in preprocessing_options:
- # Store segmentation metadata
- result['segmentation_applied'] = True
-
- # Extract combined text if available
- if 'combined_text' in preprocessing_options['segmentation_data']:
- segmentation_text = clean_text(preprocessing_options['segmentation_data']['combined_text'])
- # Add as dedicated field
- result['ocr_contents']['segmentation_text'] = segmentation_text
-
- # IMPORTANT: For documents with overlapping regions like baldwin-15th-north,
- # the intelligently merged segmentation text is more accurate than the raw OCR
- # Always use segmentation text as the primary source when available
- # This ensures clean, non-duplicated content from overlapping regions
- result['ocr_contents']['raw_text'] = segmentation_text
-
- # Also update the 'text' field which is used in some contexts
- if 'text' in result['ocr_contents']:
- result['ocr_contents']['text'] = segmentation_text
-
- # Clean pages_data if available (Mistral OCR format)
- if 'pages_data' in result:
- for page in result['pages_data']:
- if isinstance(page, dict):
- # Clean text field
- if 'text' in page:
- page['text'] = clean_text(page['text'])
-
- # Clean markdown field
- if 'markdown' in page:
- page['markdown'] = clean_text(page['markdown'])
-
- # Handle list content recursively
- elif isinstance(result, list):
- return [clean_ocr_result(item, use_segmentation, vision_enabled, preprocessing_options)
- for item in result]
-
- return result
-
-def create_results_zip(results, output_dir=None, zip_name=None):
- """
- Create a zip file containing OCR results.
-
- Args:
- results: Dictionary or list of OCR results
- output_dir: Optional output directory
- zip_name: Optional zip file name
-
- Returns:
- Path to the created zip file
- """
- # Create temporary output directory if not provided
- if output_dir is None:
- output_dir = Path.cwd() / "output"
- output_dir.mkdir(exist_ok=True)
- else:
- output_dir = Path(output_dir)
- output_dir.mkdir(exist_ok=True)
-
- # Generate zip name if not provided
- if zip_name is None:
- timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
-
- if isinstance(results, list):
- # For a list of results, create a descriptive name
- file_count = len(results)
- zip_name = f"ocr_results_{file_count}_{timestamp}.zip"
- else:
- # For single result, create descriptive filename
- base_name = results.get('file_name', 'document').split('.')[0]
- zip_name = f"{base_name}_{timestamp}.zip"
-
- try:
- # Get zip data in memory first
- zip_data = create_results_zip_in_memory(results)
-
- # Save to file
- zip_path = output_dir / zip_name
- with open(zip_path, 'wb') as f:
- f.write(zip_data)
-
- return zip_path
- except Exception as e:
- # Create an empty zip file as fallback
- logger.error(f"Error creating zip file: {str(e)}")
- zip_path = output_dir / zip_name
- with zipfile.ZipFile(zip_path, 'w') as zipf:
- zipf.writestr("info.txt", "Could not create complete archive")
-
- return zip_path
-
-def create_results_zip_in_memory(results):
- """
- Create a zip file containing OCR results in memory.
- Packages markdown with embedded image tags, raw text, and JSON file
- in a contextually relevant structure.
-
- Args:
- results: Dictionary or list of OCR results
-
- Returns:
- Binary zip file data
- """
- # Create a BytesIO object
- zip_buffer = io.BytesIO()
-
- # Create a ZipFile instance
- with zipfile.ZipFile(zip_buffer, 'w', compression=zipfile.ZIP_DEFLATED) as zipf:
- # Check if results is a list or a dictionary
- is_list = isinstance(results, list)
-
- if is_list:
- # Handle multiple results by creating subdirectories
- for idx, result in enumerate(results):
- if result and isinstance(result, dict):
- # Create a folder name based on the file name or index
- folder_name = result.get('file_name', f'document_{idx+1}')
- folder_name = Path(folder_name).stem # Remove file extension
-
- # Add files to this folder
- add_result_files_to_zip(zipf, result, f"{folder_name}/")
- else:
- # Single result - add files directly to root of zip
- add_result_files_to_zip(zipf, results)
-
- # Seek to the beginning of the BytesIO object
- zip_buffer.seek(0)
-
- # Return the zip file bytes
- return zip_buffer.getvalue()
-
-def truncate_base64_in_result(result, prefix_length=32, suffix_length=32):
- """
- Create a copy of the result dictionary with base64 image data truncated.
- This keeps the structure intact while making the JSON more readable.
-
- Args:
- result: OCR result dictionary
- prefix_length: Number of characters to keep at the beginning
- suffix_length: Number of characters to keep at the end
-
- Returns:
- Dictionary with truncated base64 data
- """
- if not result or not isinstance(result, dict):
- return {}
-
- # Create a deep copy to avoid modifying the original
- import copy
- truncated_result = copy.deepcopy(result)
-
- # Helper function to truncate base64 strings
- def truncate_base64(data):
- if not isinstance(data, str) or len(data) <= prefix_length + suffix_length + 10:
- return data
-
- # Extract prefix and suffix based on whether this is a data URI or raw base64
- if data.startswith('data:'):
- # Handle data URIs like '...'
- parts = data.split(',', 1)
- if len(parts) != 2:
- return data # Unexpected format, return as is
-
- header = parts[0] + ','
- base64_content = parts[1]
-
- if len(base64_content) <= prefix_length + suffix_length + 10:
- return data # Not long enough to truncate
-
- truncated = (f"{header}{base64_content[:prefix_length]}..."
- f"[truncated {len(base64_content) - prefix_length - suffix_length} chars]..."
- f"{base64_content[-suffix_length:]}")
- else:
- # Handle raw base64 strings
- truncated = (f"{data[:prefix_length]}..."
- f"[truncated {len(data) - prefix_length - suffix_length} chars]..."
- f"{data[-suffix_length:]}")
-
- return truncated
-
- # Helper function to recursively truncate base64 in nested structures
- def truncate_base64_recursive(obj):
- if isinstance(obj, dict):
- # Check for keys that typically contain base64 data
- for key in list(obj.keys()):
- if key in ['image_base64', 'base64'] and isinstance(obj[key], str):
- obj[key] = truncate_base64(obj[key])
- elif isinstance(obj[key], (dict, list)):
- truncate_base64_recursive(obj[key])
- elif isinstance(obj, list):
- for item in obj:
- if isinstance(item, (dict, list)):
- truncate_base64_recursive(item)
-
- # Truncate base64 data throughout the result
- truncate_base64_recursive(truncated_result)
-
- # Specifically handle the pages_data structure
- if 'pages_data' in truncated_result:
- for page in truncated_result['pages_data']:
- if isinstance(page, dict) and 'images' in page:
- for img in page['images']:
- if isinstance(img, dict) and 'image_base64' in img and isinstance(img['image_base64'], str):
- img['image_base64'] = truncate_base64(img['image_base64'])
-
- # Handle raw_response_data if present
- if 'raw_response_data' in truncated_result and isinstance(truncated_result['raw_response_data'], dict):
- if 'pages' in truncated_result['raw_response_data']:
- for page in truncated_result['raw_response_data']['pages']:
- if isinstance(page, dict) and 'images' in page:
- for img in page['images']:
- if isinstance(img, dict) and 'base64' in img and isinstance(img['base64'], str):
- img['base64'] = truncate_base64(img['base64'])
-
- return truncated_result
-
-def clean_base64_from_result(result):
- """
- Create a clean copy of the result dictionary with base64 image data removed.
- This ensures JSON files don't contain large base64 strings.
-
- Args:
- result: OCR result dictionary
-
- Returns:
- Cleaned dictionary without base64 data
- """
- if not result or not isinstance(result, dict):
- return {}
-
- # Create a deep copy to avoid modifying the original
- import copy
- clean_result = copy.deepcopy(result)
-
- # Helper function to recursively clean base64 from nested structures
- def clean_base64_recursive(obj):
- if isinstance(obj, dict):
- # Check for keys that typically contain base64 data
- for key in list(obj.keys()):
- if key in ['image_base64', 'base64']:
- obj[key] = "[BASE64_DATA_REMOVED]"
- elif isinstance(obj[key], (dict, list)):
- clean_base64_recursive(obj[key])
- elif isinstance(obj, list):
- for item in obj:
- if isinstance(item, (dict, list)):
- clean_base64_recursive(item)
-
- # Clean the entire result
- clean_base64_recursive(clean_result)
-
- # Specifically handle the pages_data structure
- if 'pages_data' in clean_result:
- for page in clean_result['pages_data']:
- if isinstance(page, dict) and 'images' in page:
- for img in page['images']:
- if isinstance(img, dict) and 'image_base64' in img:
- img['image_base64'] = "[BASE64_DATA_REMOVED]"
-
- # Handle raw_response_data if present
- if 'raw_response_data' in clean_result and isinstance(clean_result['raw_response_data'], dict):
- if 'pages' in clean_result['raw_response_data']:
- for page in clean_result['raw_response_data']['pages']:
- if isinstance(page, dict) and 'images' in page:
- for img in page['images']:
- if isinstance(img, dict) and 'base64' in img:
- img['base64'] = "[BASE64_DATA_REMOVED]"
-
- return clean_result
-
-def create_markdown_with_file_references(result, image_path_prefix="images/"):
- """
- Create a markdown document with file references to images instead of base64 embedding.
- Ideal for use in zip archives where images are stored as separate files.
-
- Args:
- result: OCR result dictionary
- image_path_prefix: Path prefix for image references (e.g., "images/")
-
- Returns:
- Markdown content as string with file references
- """
- # Similar to create_markdown_with_images but uses file references
- # Import content utils to use classification functions
- try:
- from utils.content_utils import classify_document_content, extract_document_text, extract_image_description
- content_utils_available = True
- except ImportError:
- content_utils_available = False
-
- # Get content classification
- has_text = True
- has_images = False
-
- if content_utils_available:
- classification = classify_document_content(result)
- has_text = classification['has_content']
- has_images = result.get('has_images', False)
- else:
- # Minimal fallback detection
- if 'has_images' in result:
- has_images = result['has_images']
-
- # Check for image data more thoroughly
- if 'pages_data' in result and isinstance(result['pages_data'], list):
- for page in result['pages_data']:
- if isinstance(page, dict) and 'images' in page and page['images']:
- has_images = True
- break
-
- # Start building the markdown document
- md = []
-
- # Add document title/header
- md.append(f"# {result.get('file_name', 'Document')}\n")
-
- # Add metadata section
- md.append("## Document Metadata\n")
-
- # Add timestamp
- if 'timestamp' in result:
- md.append(f"**Processed:** {result['timestamp']}\n")
-
- # Add languages if available
- if 'languages' in result and result['languages']:
- languages = [lang for lang in result['languages'] if lang]
- if languages:
- md.append(f"**Languages:** {', '.join(languages)}\n")
-
- # Add document type and topics
- if 'detected_document_type' in result:
- md.append(f"**Document Type:** {result['detected_document_type']}\n")
-
- if 'topics' in result and result['topics']:
- md.append(f"**Topics:** {', '.join(result['topics'])}\n")
-
- md.append("\n---\n")
-
- # Document title - extract from result if available
- if 'ocr_contents' in result and 'title' in result['ocr_contents'] and result['ocr_contents']['title']:
- title_content = result['ocr_contents']['title']
- md.append(f"## {title_content}\n")
-
- # Add images if present
- if has_images and 'pages_data' in result:
- md.append("## Images\n")
-
- # Extract and display all images with file references
- for page_idx, page in enumerate(result['pages_data']):
- if 'images' in page and isinstance(page['images'], list):
- for img_idx, img in enumerate(page['images']):
- if 'image_base64' in img:
- # Create image reference to file in the zip
- image_filename = f"image_{page_idx+1}_{img_idx+1}.jpg"
- image_path = f"{image_path_prefix}{image_filename}"
- image_caption = f"Image {page_idx+1}-{img_idx+1}"
- md.append(f"\n")
-
- # Add image description if available through utils
- if content_utils_available:
- description = extract_image_description(result)
- if description:
- md.append(f"*{description}*\n")
-
- md.append("\n---\n")
-
- # Add document text section
- md.append("## Text Content\n")
-
- # Extract text content systematically
- text_content = ""
- structured_sections = {}
-
- # Helper function to extract clean text from dictionary objects
- def extract_clean_text(content):
- if isinstance(content, str):
- # Check if content is a stringified JSON
- if content.strip().startswith("{") and content.strip().endswith("}"):
- try:
- # Try to parse as JSON
- content_dict = json.loads(content.replace("'", '"'))
- if 'text' in content_dict:
- return content_dict['text']
- return content
- except:
- return content
- return content
- elif isinstance(content, dict):
- # If it's a dictionary with a 'text' key, return just that value
- if 'text' in content and isinstance(content['text'], str):
- return content['text']
- return content
- return content
-
- if content_utils_available:
- # Use the systematic utility function for main text
- text_content = extract_document_text(result)
- text_content = extract_clean_text(text_content)
-
- # Collect all available structured sections
- if 'ocr_contents' in result:
- for field, content in result['ocr_contents'].items():
- # Skip certain fields that are handled separately
- if field in ["raw_text", "error", "partial_text", "main_text"]:
- continue
-
- if content:
- # Extract clean text from content if possible
- clean_content = extract_clean_text(content)
- # Add this as a structured section
- structured_sections[field] = clean_content
- else:
- # Fallback extraction logic
- if 'ocr_contents' in result:
- # First find main text
- for field in ["main_text", "content", "text", "transcript", "raw_text"]:
- if field in result['ocr_contents'] and result['ocr_contents'][field]:
- content = result['ocr_contents'][field]
- if isinstance(content, str) and content.strip():
- text_content = content
- break
- elif isinstance(content, dict):
- # Try to convert complex objects to string
- try:
- text_content = json.dumps(content, indent=2)
- break
- except:
- pass
-
- # Then collect all structured sections
- for field, content in result['ocr_contents'].items():
- # Skip certain fields that are handled separately
- if field in ["raw_text", "error", "partial_text", "main_text", "content", "text", "transcript"]:
- continue
-
- if content:
- # Add this as a structured section
- structured_sections[field] = content
-
- # Add the main text content - display raw text without a field label
- if text_content:
- # Check if this is from raw_text (based on content match)
- is_raw_text = False
- if 'ocr_contents' in result and 'raw_text' in result['ocr_contents']:
- if result['ocr_contents']['raw_text'] == text_content:
- is_raw_text = True
-
- # Display content without adding a "raw_text:" label
- md.append(text_content + "\n\n")
-
- # Add structured sections if available
- if structured_sections:
- for section_name, section_content in structured_sections.items():
- # Use proper markdown header for sections - consistently capitalize all section names
- display_name = section_name.replace("_", " ").capitalize()
- # Handle different content types
- if isinstance(section_content, str):
- md.append(section_content + "\n\n")
- elif isinstance(section_content, dict):
- # Dictionary content - format as key-value pairs
- for key, value in section_content.items():
- # Treat all values as plain text to maintain content purity
- # This prevents JSON-like structures from being formatted as code blocks
- md.append(f"**{key}:** {value}\n\n")
- elif isinstance(section_content, list):
- # List content - create a markdown list
- for item in section_content:
- # Treat all items as plain text
- md.append(f"- {item}\n")
- md.append("\n")
-
- # Join all markdown parts into a single string
- return "\n".join(md)
-
-def add_result_files_to_zip(zipf, result, prefix=""):
- """
- Add files for a single result to a zip file.
-
- Args:
- zipf: ZipFile instance to add files to
- result: OCR result dictionary
- prefix: Optional prefix for file paths in the zip
- """
- if not result or not isinstance(result, dict):
- return
-
- # Create a timestamp for filename if not in result
- timestamp = result.get('timestamp', datetime.now().strftime("%Y-%m-%d_%H-%M-%S"))
-
- # Get base name for files
- file_name = result.get('file_name', 'document')
- base_name = Path(file_name).stem
-
- try:
- # 1. Add JSON file - with base64 data cleaned out
- clean_result = clean_base64_from_result(result)
- json_str = json.dumps(clean_result, indent=2)
- zipf.writestr(f"{prefix}{base_name}.json", json_str)
-
- # 2. Add markdown file that exactly matches Tab 1 display
- # Use the create_markdown_with_images function to ensure it matches the UI exactly
- try:
- markdown_content = create_markdown_with_images(result)
- zipf.writestr(f"{prefix}{base_name}.md", markdown_content)
- except Exception as e:
- logger.error(f"Error creating markdown: {str(e)}")
- # Fallback to simpler markdown if error occurs
- zipf.writestr(f"{prefix}{base_name}.md", f"# {file_name}\n\nError generating complete markdown output.")
-
- # Extract and save images first to ensure they exist before creating markdown
- img_paths = {}
- has_images = result.get('has_images', False)
-
- # 3. Add individual images if available
- if has_images and 'pages_data' in result:
- img_folder = f"{prefix}images/"
- for page_idx, page in enumerate(result['pages_data']):
- if 'images' in page and isinstance(page['images'], list):
- for img_idx, img in enumerate(page['images']):
- if 'image_base64' in img and img['image_base64']:
- # Extract the base64 data
- try:
- # Get the base64 data
- img_data = img['image_base64']
-
- # Handle the base64 data carefully
- if isinstance(img_data, str):
- # If it has a data URI prefix, remove it
- if ',' in img_data and ';base64,' in img_data:
- # Keep the complete data after the comma
- img_data = img_data.split(',', 1)[1]
-
- # Make sure we have the complete data (not truncated)
- try:
- # Decode the base64 data with padding correction
- # Add padding if needed to prevent truncation errors
- missing_padding = len(img_data) % 4
- if missing_padding:
- img_data += '=' * (4 - missing_padding)
- img_bytes = base64.b64decode(img_data)
- except Exception as e:
- logger.error(f"Base64 decoding error: {str(e)} for image {page_idx}-{img_idx}")
- # Skip this image if we can't decode it
- continue
- else:
- # If it's not a string (e.g., already bytes), use it directly
- img_bytes = img_data
-
- # Create image filename
- image_filename = f"image_{page_idx+1}_{img_idx+1}.jpg"
- img_paths[(page_idx, img_idx)] = image_filename
-
- # Write the image to the zip file
- zipf.writestr(f"{img_folder}{image_filename}", img_bytes)
- except Exception as e:
- logger.warning(f"Could not add image to zip: {str(e)}")
-
- # 4. Add markdown with file references to images for offline viewing
- try:
- if has_images:
- # Create markdown with file references
- file_ref_markdown = create_markdown_with_file_references(result, "images/")
- zipf.writestr(f"{prefix}{base_name}_with_files.md", file_ref_markdown)
- except Exception as e:
- logger.warning(f"Error creating markdown with file references: {str(e)}")
-
- # 5. Add README.txt with explanation of file contents
- readme_content = f"""
-OCR RESULTS FOR: {file_name}
-Processed: {timestamp}
-
-This archive contains the following files:
-
-- {base_name}.json: Complete JSON data with all extracted information
-- {base_name}.md: Markdown document with embedded base64 images (exactly as shown in the app)
-- {base_name}_with_files.md: Alternative markdown with file references instead of base64 (for offline viewing)
-- images/ folder: Contains extracted images from the document (if present)
-
-Generated by Historical OCR using Mistral AI
- """
- zipf.writestr(f"{prefix}README.txt", readme_content.strip())
-
- except Exception as e:
- logger.error(f"Error adding files to zip: {str(e)}")
-
-def create_markdown_with_images(result):
- """
- Create a clean Markdown document from OCR results that properly preserves
- image references and text structure, following the principle of content purity.
-
- Args:
- result: OCR result dictionary
-
- Returns:
- Markdown content as string
- """
- # Similar to create_markdown_with_file_references but embeds base64 images
- # Import content utils to use classification functions
- try:
- from utils.content_utils import classify_document_content, extract_document_text, extract_image_description
- content_utils_available = True
- except ImportError:
- content_utils_available = False
-
- # Get content classification
- has_text = True
- has_images = False
-
- if content_utils_available:
- classification = classify_document_content(result)
- has_text = classification['has_content']
- has_images = result.get('has_images', False)
- else:
- # Minimal fallback detection
- if 'has_images' in result:
- has_images = result['has_images']
-
- # Check for image data more thoroughly
- if 'pages_data' in result and isinstance(result['pages_data'], list):
- for page in result['pages_data']:
- if isinstance(page, dict) and 'images' in page and page['images']:
- has_images = True
- break
-
- # Start building the markdown document
- md = []
-
- # Add document title/header
- md.append(f"# {result.get('file_name', 'Document')}\n")
-
- # Add metadata section
- md.append("## Document Metadata\n")
-
- # Add timestamp
- if 'timestamp' in result:
- md.append(f"**Processed:** {result['timestamp']}\n")
-
- # Add languages if available
- if 'languages' in result and result['languages']:
- languages = [lang for lang in result['languages'] if lang]
- if languages:
- md.append(f"**Languages:** {', '.join(languages)}\n")
-
- # Add document type and topics
- if 'detected_document_type' in result:
- md.append(f"**Document Type:** {result['detected_document_type']}\n")
-
- if 'topics' in result and result['topics']:
- md.append(f"**Topics:** {', '.join(result['topics'])}\n")
-
- md.append("\n---\n")
-
- # Document title - extract from result if available
- if 'ocr_contents' in result and 'title' in result['ocr_contents'] and result['ocr_contents']['title']:
- title_content = result['ocr_contents']['title']
- md.append(f"## {title_content}\n")
-
- # Add images if present - with base64 embedding
- if has_images and 'pages_data' in result:
- md.append("## Images\n")
-
- # Extract and display all images with embedded base64
- for page_idx, page in enumerate(result['pages_data']):
- if 'images' in page and isinstance(page['images'], list):
- for img_idx, img in enumerate(page['images']):
- if 'image_base64' in img:
- # Use the base64 data directly
- image_caption = f"Image {page_idx+1}-{img_idx+1}"
- img_data = img['image_base64']
-
- # Make sure it has proper data URI format
- if isinstance(img_data, str) and not img_data.startswith('data:'):
- img_data = f"data:image/jpeg;base64,{img_data}"
-
- md.append(f"\n")
-
- # Add image description if available through utils
- if content_utils_available:
- description = extract_image_description(result)
- if description:
- md.append(f"*{description}*\n")
-
- md.append("\n---\n")
-
- # Add document text section
- md.append("## Text Content\n")
-
- # Extract text content systematically
- text_content = ""
- structured_sections = {}
-
- if content_utils_available:
- # Use the systematic utility function for main text
- text_content = extract_document_text(result)
-
- # Collect all available structured sections
- if 'ocr_contents' in result:
- for field, content in result['ocr_contents'].items():
- # Skip certain fields that are handled separately
- if field in ["raw_text", "error", "partial_text", "main_text"]:
- continue
-
- if content:
- # Add this as a structured section
- structured_sections[field] = content
- else:
- # Fallback extraction logic
- if 'ocr_contents' in result:
- # First find main text
- for field in ["main_text", "content", "text", "transcript", "raw_text"]:
- if field in result['ocr_contents'] and result['ocr_contents'][field]:
- content = result['ocr_contents'][field]
- if isinstance(content, str) and content.strip():
- text_content = content
- break
- elif isinstance(content, dict):
- # Try to convert complex objects to string
- try:
- text_content = json.dumps(content, indent=2)
- break
- except:
- pass
-
- # Then collect all structured sections
- for field, content in result['ocr_contents'].items():
- # Skip certain fields that are handled separately
- if field in ["raw_text", "error", "partial_text", "main_text", "content", "text", "transcript"]:
- continue
-
- if content:
- # Add this as a structured section
- structured_sections[field] = content
-
- # Add the main text content
- if text_content:
- md.append(text_content + "\n\n")
-
- # Add structured sections if available
- if structured_sections:
- for section_name, section_content in structured_sections.items():
- # Use proper markdown header for sections - consistently capitalize all section names
- display_name = section_name.replace("_", " ").capitalize()
- md.append(f"### {display_name}\n")
- # Add a separator for clarity
- md.append("\n---\n\n")
-
- # Handle different content types
- if isinstance(section_content, str):
- md.append(section_content + "\n\n")
- elif isinstance(section_content, dict):
- # Dictionary content - format as key-value pairs
- for key, value in section_content.items():
- # Treat all values as plain text to maintain content purity
- md.append(f"**{key}:** {value}\n\n")
- elif isinstance(section_content, list):
- # List content - create a markdown list
- for item in section_content:
- # Keep list items as plain text
- md.append(f"- {item}\n")
- md.append("\n")
-
- # Join all markdown parts into a single string
- return "\n".join(md)
diff --git a/utils/pdf_ocr.py b/utils/pdf_ocr.py
deleted file mode 100644
index 0a9107f821b17e6d6d880575fb98f94a05d7d181..0000000000000000000000000000000000000000
--- a/utils/pdf_ocr.py
+++ /dev/null
@@ -1,457 +0,0 @@
-#!/usr/bin/env python3
-"""
-PDFOCR - Module for processing PDF files with OCR and extracting structured data.
-Provides robust PDF to image conversion before OCR processing.
-"""
-
-import json
-import os
-import tempfile
-import logging
-from pathlib import Path
-from typing import Optional, Dict, List, Union, Tuple, Any
-
-# Configure logging
-logging.basicConfig(level=logging.INFO,
- format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
-logger = logging.getLogger("pdf_ocr")
-
-# Import StructuredOCR for OCR processing
-from structured_ocr import StructuredOCR
-
-class PDFConversionResult:
- """Class to hold results of PDF to image conversion."""
-
- def __init__(self,
- success: bool,
- images: List[Path] = None,
- error: str = None,
- page_count: int = 0,
- temp_files: List[str] = None):
- """Initialize the conversion result.
-
- Args:
- success: Whether the conversion was successful
- images: List of paths to the converted images
- error: Error message if conversion failed
- page_count: Total number of pages in the PDF
- temp_files: List of temporary files that should be cleaned up
- """
- self.success = success
- self.images = images or []
- self.error = error
- self.page_count = page_count
- self.temp_files = temp_files or []
-
- def __bool__(self):
- """Enable boolean evaluation of the result."""
- return self.success
-
- def cleanup(self):
- """Clean up any temporary files created during conversion."""
- for temp_file in self.temp_files:
- try:
- if os.path.exists(temp_file):
- os.unlink(temp_file)
- logger.debug(f"Removed temporary file: {temp_file}")
- except Exception as e:
- logger.warning(f"Failed to remove temporary file {temp_file}: {e}")
- self.temp_files = []
-
-
-class PDFOCR:
- """Class for processing PDF files with OCR and extracting structured data."""
-
- def __init__(self, api_key=None):
- """Initialize the PDF OCR processor."""
- self.processor = StructuredOCR(api_key=api_key)
- self.temp_files = []
-
- def __del__(self):
- """Clean up resources when object is destroyed."""
- self.cleanup()
-
- def cleanup(self):
- """Clean up any temporary files."""
- for temp_file in self.temp_files:
- try:
- if os.path.exists(temp_file):
- os.unlink(temp_file)
- logger.debug(f"Removed temporary file: {temp_file}")
- except Exception as e:
- logger.warning(f"Failed to remove temporary file {temp_file}: {e}")
- self.temp_files = []
-
- def convert_pdf_to_images(self,
- pdf_path: Union[str, Path],
- dpi: int = 200,
- max_pages: Optional[int] = None,
- page_numbers: Optional[List[int]] = None) -> PDFConversionResult:
- """
- Convert a PDF file to images.
-
- Args:
- pdf_path: Path to the PDF file
- dpi: DPI for the output images
- max_pages: Maximum number of pages to convert (None for all)
- page_numbers: Specific page numbers to convert (1-based indexing)
-
- Returns:
- PDFConversionResult object with conversion results
- """
- pdf_path = Path(pdf_path)
- if not pdf_path.exists():
- return PDFConversionResult(
- success=False,
- error=f"PDF file not found: {pdf_path}"
- )
-
- # Check file size
- file_size_mb = pdf_path.stat().st_size / (1024 * 1024)
- logger.info(f"PDF size: {file_size_mb:.2f} MB")
-
- try:
- # Import pdf2image for conversion
- import pdf2image
-
- # Initialize list for temporary files
- temp_files = []
-
- # Optimize conversion parameters based on file size
- thread_count = min(4, os.cpu_count() or 2)
-
- # First, determine total pages in the document
- logger.info("Determining PDF page count...")
- try:
- # Use a lightweight approach with multi-threading for faster processing
- pdf_info = pdf2image.convert_from_path(
- pdf_path,
- dpi=72, # Low DPI just for info
- first_page=1,
- last_page=1,
- size=(100, 100), # Tiny image to save memory
- fmt="jpeg",
- thread_count=thread_count,
- output_file=None
- )
-
- # Get page count from poppler info if available
- if hasattr(pdf_info, 'n_pages'):
- total_pages = pdf_info.n_pages
- else:
- # Try a different approach to get page count
- try:
- from pypdf import PdfReader
- reader = PdfReader(pdf_path)
- total_pages = len(reader.pages)
- except:
- total_pages = 1
- logger.warning("Could not determine total page count, assuming 1 page")
- except Exception as e:
- logger.warning(f"Failed to determine page count: {e}")
- total_pages = 1
-
- logger.info(f"PDF has {total_pages} total pages")
-
- # Determine which pages to process
- pages_to_process = []
-
- # If specific pages are requested, use those
- if page_numbers and any(1 <= p <= total_pages for p in page_numbers):
- pages_to_process = [p for p in page_numbers if 1 <= p <= total_pages]
- logger.info(f"Converting {len(pages_to_process)} specified pages: {pages_to_process}")
- # If max_pages is set, limit to that number
- elif max_pages and max_pages < total_pages:
- pages_to_process = list(range(1, max_pages + 1))
- logger.info(f"Converting first {max_pages} pages of {total_pages} total")
- # Otherwise convert all pages if reasonable count
- else:
- pages_to_process = list(range(1, total_pages + 1))
- logger.info(f"Converting all {total_pages} pages")
-
- # Convert PDF to images
- converted_images = []
-
- # Process in batches for better memory management
- batch_size = min(5, len(pages_to_process)) # Process up to 5 pages at a time
- for i in range(0, len(pages_to_process), batch_size):
- batch_pages = pages_to_process[i:i+batch_size]
- logger.info(f"Converting batch of pages {batch_pages}")
-
- # Convert this batch of pages
- try:
- batch_images = pdf2image.convert_from_path(
- pdf_path,
- dpi=dpi,
- first_page=min(batch_pages),
- last_page=max(batch_pages),
- thread_count=thread_count,
- fmt="jpeg"
- )
-
- # Map converted images to requested page numbers
- for idx, page_num in enumerate(range(min(batch_pages), max(batch_pages) + 1)):
- if page_num in pages_to_process and idx < len(batch_images):
- # Save the image to a temporary file
- img_temp_path = tempfile.NamedTemporaryFile(suffix=f'_page{page_num}.jpg', delete=False).name
- batch_images[idx].save(img_temp_path, format='JPEG', quality=95)
-
- # Add to results and track the temp file
- converted_images.append((page_num, Path(img_temp_path)))
- temp_files.append(img_temp_path)
- except Exception as e:
- logger.error(f"Failed to convert batch {batch_pages}: {e}")
- # Continue with other batches
-
- # Sort by page number to ensure correct order
- converted_images.sort(key=lambda x: x[0])
-
- # Extract just the image paths in correct page order
- image_paths = [img_path for _, img_path in converted_images]
-
- if not image_paths:
- # No images were successfully converted
- return PDFConversionResult(
- success=False,
- error="Failed to convert PDF to images",
- page_count=total_pages,
- temp_files=temp_files
- )
-
- # Store temp files for later cleanup
- self.temp_files.extend(temp_files)
-
- # Return successful result
- return PDFConversionResult(
- success=True,
- images=image_paths,
- page_count=total_pages,
- temp_files=temp_files
- )
-
- except ImportError:
- return PDFConversionResult(
- success=False,
- error="pdf2image module not available. Please install with: pip install pdf2image"
- )
- except Exception as e:
- logger.error(f"PDF conversion error: {str(e)}")
- return PDFConversionResult(
- success=False,
- error=f"Failed to convert PDF to images: {str(e)}"
- )
-
- def process_pdf(self, pdf_path, use_vision=True, max_pages=None, custom_pages=None, custom_prompt=None):
- """
- Process a PDF file with OCR and extract structured data.
-
- Args:
- pdf_path: Path to the PDF file
- use_vision: Whether to use vision model for improved analysis
- max_pages: Maximum number of pages to process
- custom_pages: Specific page numbers to process (1-based indexing)
- custom_prompt: Custom instructions for processing
-
- Returns:
- Dictionary with structured OCR results
- """
- pdf_path = Path(pdf_path)
- if not pdf_path.exists():
- raise FileNotFoundError(f"PDF file not found: {pdf_path}")
-
- # Convert page numbers to list if provided
- page_numbers = None
- if custom_pages:
- if isinstance(custom_pages, (list, tuple)):
- page_numbers = custom_pages
- else:
- try:
- # Try to parse as comma-separated string
- page_numbers = [int(p.strip()) for p in str(custom_pages).split(',')]
- except:
- logger.warning(f"Invalid custom_pages format: {custom_pages}. Should be list or comma-separated string.")
-
- # First try our optimized PDF to image conversion
- conversion_result = self.convert_pdf_to_images(
- pdf_path=pdf_path,
- max_pages=max_pages,
- page_numbers=page_numbers
- )
-
- if conversion_result.success and conversion_result.images:
- logger.info(f"Successfully converted PDF to {len(conversion_result.images)} images")
-
- # Determine if we need to add PDF-specific context to the prompt
- modified_prompt = custom_prompt
- if not modified_prompt:
- modified_prompt = f"This is a multi-page PDF document with {conversion_result.page_count} total pages, of which {len(conversion_result.images)} were processed."
- elif "pdf" not in modified_prompt.lower() and "multi-page" not in modified_prompt.lower():
- modified_prompt += f" This is a multi-page PDF document with {conversion_result.page_count} total pages, of which {len(conversion_result.images)} were processed."
-
- try:
- # First process the first page with vision if requested
- first_page_result = self.processor.process_file(
- file_path=conversion_result.images[0],
- file_type="image",
- use_vision=use_vision,
- custom_prompt=modified_prompt
- )
-
- # Process additional pages if available
- all_pages_text = []
- all_languages = set()
-
- # Extract text from first page
- if 'ocr_contents' in first_page_result and 'raw_text' in first_page_result['ocr_contents']:
- all_pages_text.append(first_page_result['ocr_contents']['raw_text'])
-
- # Track languages from first page
- if 'languages' in first_page_result:
- for lang in first_page_result['languages']:
- all_languages.add(str(lang))
-
- # Process additional pages if any
- for i, img_path in enumerate(conversion_result.images[1:], 1):
- try:
- # Simple text extraction for additional pages
- page_result = self.processor.process_file(
- file_path=img_path,
- file_type="image",
- use_vision=False, # Use simpler processing for additional pages
- custom_prompt=f"This is page {i+1} of a {conversion_result.page_count}-page document."
- )
-
- # Extract text
- if 'ocr_contents' in page_result and 'raw_text' in page_result['ocr_contents']:
- all_pages_text.append(page_result['ocr_contents']['raw_text'])
-
- # Track languages
- if 'languages' in page_result:
- for lang in page_result['languages']:
- all_languages.add(str(lang))
- except Exception as e:
- logger.warning(f"Error processing page {i+1}: {e}")
-
- # Combine all text into a single document
- combined_text = "\n\n".join(all_pages_text)
-
- # Update the first page result with combined data
- if 'ocr_contents' in first_page_result:
- first_page_result['ocr_contents']['raw_text'] = combined_text
-
- # Update languages with all detected languages
- if all_languages:
- first_page_result['languages'] = list(all_languages)
-
- # Add PDF metadata
- first_page_result['file_name'] = pdf_path.name
- first_page_result['file_type'] = "pdf"
- first_page_result['total_pages'] = conversion_result.page_count
- first_page_result['processed_pages'] = len(conversion_result.images)
-
- # Add conversion info
- first_page_result['pdf_conversion'] = {
- "method": "pdf2image",
- "pages_converted": len(conversion_result.images),
- "pages_requested": len(page_numbers) if page_numbers else (max_pages or conversion_result.page_count)
- }
-
- return first_page_result
- except Exception as e:
- logger.error(f"Error processing converted images: {e}")
- # Fall back to direct processing via StructuredOCR
-
- finally:
- # Clean up temporary files
- conversion_result.cleanup()
-
- # If conversion failed or processing the images failed, fall back to direct processing
- logger.info(f"Using direct StructuredOCR processing for PDF")
- return self.processor.process_file(
- file_path=pdf_path,
- file_type="pdf",
- use_vision=use_vision,
- max_pages=max_pages,
- custom_pages=custom_pages,
- custom_prompt=custom_prompt
- )
-
- def save_json_output(self, pdf_path, output_path, use_vision=True, max_pages=None, custom_pages=None, custom_prompt=None):
- """
- Process a PDF file and save the structured output as JSON.
-
- Args:
- pdf_path: Path to the PDF file
- output_path: Path where to save the JSON output
- use_vision: Whether to use vision model for improved analysis
- max_pages: Maximum number of pages to process
- custom_pages: Specific page numbers to process (1-based indexing)
- custom_prompt: Custom instructions for processing
-
- Returns:
- Path to the saved JSON file
- """
- # Process the PDF
- result = self.process_pdf(
- pdf_path,
- use_vision=use_vision,
- max_pages=max_pages,
- custom_pages=custom_pages,
- custom_prompt=custom_prompt
- )
-
- # Save the result to JSON
- output_path = Path(output_path)
- output_path.parent.mkdir(parents=True, exist_ok=True)
-
- with open(output_path, 'w') as f:
- json.dump(result, f, indent=2)
-
- return output_path
-
-# For testing directly
-if __name__ == "__main__":
- import sys
- import argparse
-
- parser = argparse.ArgumentParser(description="Process PDF files with OCR.")
- parser.add_argument("pdf_path", help="Path to the PDF file to process")
- parser.add_argument("--output", "-o", help="Path to save the output JSON")
- parser.add_argument("--no-vision", dest="use_vision", action="store_false",
- help="Disable vision model for processing")
- parser.add_argument("--max-pages", type=int, help="Maximum number of pages to process")
- parser.add_argument("--pages", help="Specific pages to process (comma-separated)")
- parser.add_argument("--prompt", help="Custom prompt for processing")
-
- args = parser.parse_args()
-
- processor = PDFOCR()
-
- # Parse custom pages if provided
- custom_pages = None
- if args.pages:
- try:
- custom_pages = [int(p.strip()) for p in args.pages.split(',')]
- except:
- print(f"Error parsing pages: {args.pages}. Should be comma-separated list of numbers.")
- sys.exit(1)
-
- if args.output:
- result_path = processor.save_json_output(
- args.pdf_path,
- args.output,
- use_vision=args.use_vision,
- max_pages=args.max_pages,
- custom_pages=custom_pages,
- custom_prompt=args.prompt
- )
- print(f"Results saved to: {result_path}")
- else:
- result = processor.process_pdf(
- args.pdf_path,
- use_vision=args.use_vision,
- max_pages=args.max_pages,
- custom_pages=custom_pages,
- custom_prompt=args.prompt
- )
- print(json.dumps(result, indent=2))
diff --git a/utils/text_utils.py b/utils/text_utils.py
deleted file mode 100644
index 7eafbda672a12751344d1082d392720267775317..0000000000000000000000000000000000000000
--- a/utils/text_utils.py
+++ /dev/null
@@ -1,279 +0,0 @@
-"""
-Utility functions for text processing.
-Contains helper functions for working with text data from OCR.
-"""
-
-import re
-import logging
-import difflib
-from typing import List, Dict, Any, Optional
-
-# Configure logging
-logging.basicConfig(level=logging.INFO,
- format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
-logger = logging.getLogger(__name__)
-
-def format_ocr_text(text: str, for_display: bool = False) -> str:
- """
- Format OCR text for display or processing.
- This function maintains clean separation between data and presentation.
-
- Args:
- text: OCR text to format
- for_display: Whether to format for display (HTML) or plain text
-
- Returns:
- Formatted text
- """
- if not text:
- return ""
-
- # Clean the text first
- text = clean_raw_text(text)
-
- # Basic text formatting (line breaks, etc.)
- formatted_text = text.replace("\n", " " if for_display else "\n")
-
- if for_display:
- # For display, wrap in paragraph tags but avoid unnecessary divs
- # to maintain content purity
- return f"
{formatted_text}
"
- else:
- # For processing, return clean text only - no markup
- return formatted_text
-
-def format_markdown_text(text: str, preserve_format: bool = True) -> str:
- """
- Format text as Markdown, preserving or enhancing its structure.
- Ensures that text has clean markdown formatting without introducing
- unnecessary presentation elements.
-
- Args:
- text: Raw text to format as Markdown
- preserve_format: Whether to preserve original formatting
-
- Returns:
- Markdown-formatted text
- """
- if not text:
- return ""
-
- # Clean the text first
- text = clean_raw_text(text)
-
- # Normalize line endings
- text = text.replace('\r\n', '\n').replace('\r', '\n')
-
- # Preserve paragraphs if requested
- if preserve_format:
- # Ensure paragraphs are separated by double line breaks
- text = re.sub(r'\n{3,}', '\n\n', text)
- else:
- # Convert single line breaks within paragraphs to spaces
- text = re.sub(r'(? str:
- """
- Clean raw text by removing unnecessary whitespace and artifacts.
-
- Args:
- text: Raw text to clean
-
- Returns:
- Cleaned text
- """
- if not text:
- return ""
-
- # Remove image references like 
- text = re.sub(r'!\[.*?\]\(data:image/[^)]+\)', '', text)
-
- # Remove basic markdown image references like 
- text = re.sub(r'!\[[^\]]*\]\([^)]+\)', '', text)
-
- # Remove base64 encoded image data
- text = re.sub(r'data:image/[^;]+;base64,[a-zA-Z0-9+/=]+', '', text)
-
- # Clean up any JSON-like image object references
- text = re.sub(r'{"image(_data)?":("[^"]*"|null|true|false|\{[^}]*\}|\[[^\]]*\])}', '', text)
-
- # Clean up excessive whitespace and line breaks created by removals
- text = re.sub(r'\n{3,}', '\n\n', text)
- text = re.sub(r'\s{3,}', ' ', text)
-
- return text.strip()
-
-def detect_content_regions(image_np):
- """
- Detect content regions based on text density analysis.
- Returns regions with adaptive overlapping.
-
- Args:
- image_np: Numpy array image
-
- Returns:
- list: List of region tuples (x, y, width, height)
- """
- # Import necessary modules
- import numpy as np
- import cv2
-
- # Convert to grayscale for text detection
- if len(image_np.shape) > 2 and image_np.shape[2] == 3:
- gray = cv2.cvtColor(image_np, cv2.COLOR_RGB2GRAY)
- else:
- gray = image_np
-
- # Create text density profile
- # Sum pixel values horizontally to get vertical text density
- v_profile = np.sum(255 - gray, axis=1)
-
- # Normalize the profile
- v_profile = v_profile / np.max(v_profile) if np.max(v_profile) > 0 else v_profile
-
- # Find significant density changes
- changes = []
- threshold = 0.2
- for i in range(1, len(v_profile)):
- if abs(v_profile[i] - v_profile[i-1]) > threshold:
- changes.append(i)
-
- # Create adaptive regions based on density changes
- img_height, img_width = gray.shape
-
- # Default to at least 3 regions with overlap
- if len(changes) < 2:
- # If no significant changes, use default division with overlapping regions
- header_height = int(img_height * 0.3)
- middle_start = int(img_height * 0.2)
- middle_height = int(img_height * 0.4)
- body_start = int(img_height * 0.5)
- body_height = img_height - body_start
- else:
- # Use detected density changes for more precise regions
- changes = sorted(changes)
- header_height = changes[0] + int(img_height * 0.05) # Add overlap
- middle_start = max(0, changes[0] - int(img_height * 0.05))
-
- if len(changes) > 1:
- middle_height = (changes[1] - middle_start) + int(img_height * 0.05)
- body_start = max(0, changes[1] - int(img_height * 0.05))
- else:
- middle_height = int(img_height * 0.4)
- body_start = int(img_height * 0.5)
-
- body_height = img_height - body_start
-
- # Define regions with adaptive overlap
- regions = [
- (0, 0, img_width, header_height), # Header region
- (0, middle_start, img_width, middle_height), # Middle region with overlap
- (0, body_start, img_width, body_height) # Body region with overlap
- ]
-
- return regions
-
-def merge_region_texts(regions: List[Dict[str, Any]], min_similarity_threshold: float = 0.7) -> str:
- """
- Intelligently merge text from multiple document regions, handling overlapping content.
- Uses text similarity detection to avoid duplicating content from overlapping regions.
-
- Args:
- regions: List of region dictionaries, each containing 'text' and 'order' keys
- min_similarity_threshold: Minimum similarity ratio to consider text as duplicate
-
- Returns:
- Merged text with duplications removed
- """
- # If no regions, return empty string
- if not regions:
- return ""
-
- # If only one region, return its text directly
- if len(regions) == 1:
- return regions[0]['text']
-
- # Sort regions by their defined order
- sorted_regions = sorted(regions, key=lambda x: x.get('order', 0))
-
- # Extract text segments from each region
- texts = [region.get('text', '').strip() for region in sorted_regions]
-
- # Remove empty texts
- texts = [t for t in texts if t]
-
- if not texts:
- return ""
-
- # Start with the first region's text
- merged_text = texts[0]
-
- # Process each subsequent region
- for i in range(1, len(texts)):
- current_text = texts[i]
-
- # Skip if current text is empty
- if not current_text:
- continue
-
- # Find potential overlap with existing merged text
- # Split both texts into lines for line-by-line comparison
- merged_lines = merged_text.splitlines()
- current_lines = current_text.splitlines()
-
- # Initialize variables to track where to start appending
- append_from_line = 0 # Default: append all lines from current text
- max_similarity = 0.0
- max_similarity_pos = -1
-
- # Check for potential line duplications
- # Look at the last N lines of merged text (N = min(20, len(merged_lines)))
- # to see if they match the first N lines of current text
- check_lines = min(20, len(merged_lines))
- for j in range(1, check_lines + 1):
- # Get the last j lines from merged text
- merged_end = "\n".join(merged_lines[-j:])
-
- # Get the first j lines from current text
- current_start = "\n".join(current_lines[:j])
-
- # Skip comparison if either section is too short
- if len(merged_end) < 10 or len(current_start) < 10:
- continue
-
- # Calculate similarity ratio
- similarity = difflib.SequenceMatcher(None, merged_end, current_start).ratio()
-
- # If we found a better match, update
- if similarity > max_similarity and similarity >= min_similarity_threshold:
- max_similarity = similarity
- max_similarity_pos = j
-
- # If we found a good match, skip those lines from current text
- if max_similarity_pos > 0:
- logger.info(f"Found overlapping text with similarity {max_similarity:.2f}, skipping {max_similarity_pos} lines")
- append_from_line = max_similarity_pos
-
- # Append non-duplicated content with a separator
- if append_from_line < len(current_lines):
- remaining_text = "\n".join(current_lines[append_from_line:])
- if remaining_text.strip():
- merged_text += "\n\n" + remaining_text
-
- return merged_text
diff --git a/utils/ui_utils.py b/utils/ui_utils.py
deleted file mode 100644
index 2738909aaa3fb27d51beb3c9fc6661f3f1ec5df8..0000000000000000000000000000000000000000
--- a/utils/ui_utils.py
+++ /dev/null
@@ -1,351 +0,0 @@
-"""
-UI utilities for OCR results display.
-"""
-import os
-import streamlit as st
-import json
-import base64
-import io
-from datetime import datetime
-
-from utils.text_utils import format_ocr_text
-from utils.content_utils import classify_document_content, format_structured_data
-
-def display_results(result, container, custom_prompt=""):
- """Display OCR results in the provided container"""
- with container:
- # Add heading for document metadata
- st.markdown("### Document Metadata")
-
- # Filter out large data structures from metadata display
- meta = {k: v for k, v in result.items()
- if k not in ['pages_data', 'illustrations', 'ocr_contents', 'raw_response_data']}
-
- # Create a compact metadata section for primary metadata
- meta_html = '
'
-
- # Document type
- if 'detected_document_type' in meta:
- meta_html += f'
Type: {meta["detected_document_type"]}
'
-
- # Page information
- if 'limited_pages' in meta:
- meta_html += f'
'
- st.markdown(meta_html, unsafe_allow_html=True)
-
- # Processing time - separate section for proper ordering of all metadata fields
- if 'processing_time' in meta:
- time_html = '
'
- time_html += '
Time:
'
- time_html += f'
{meta["processing_time"]:.1f}s
'
- time_html += '
'
- st.markdown(time_html, unsafe_allow_html=True)
-
- # Language metadata on a separate line, Subject Tags below
-
- # First show languages if available
- if 'languages' in result and result['languages']:
- languages = [lang for lang in result['languages'] if lang is not None]
- if languages:
- # Create a dedicated line for Languages
- lang_html = '
'
- lang_html += '
Language:
'
-
- # Add language tags
- for lang in languages:
- # Clean language name if needed
- clean_lang = str(lang).strip()
- if clean_lang: # Only add if not empty
- lang_html += f'{clean_lang}'
-
- lang_html += '
'
- st.markdown(lang_html, unsafe_allow_html=True)
-
- # Prepare download files
- try:
- # Get base filename
- from utils.general_utils import create_descriptive_filename
- original_file = result.get('file_name', 'document')
- base_name = create_descriptive_filename(original_file, result, "")
- base_name = os.path.splitext(base_name)[0]
-
- # 1. JSON download - with base64 data truncated for readability
- from utils.image_utils import truncate_base64_in_result
- truncated_result = truncate_base64_in_result(result)
- json_str = json.dumps(truncated_result, indent=2)
- json_filename = f"{base_name}.json"
- json_b64 = base64.b64encode(json_str.encode()).decode()
-
- # 2. Create ZIP with all files
- from utils.image_utils import create_results_zip_in_memory
- zip_data = create_results_zip_in_memory(result)
- timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
- zip_filename = f"{base_name}_{timestamp}.zip"
- zip_b64 = base64.b64encode(zip_data).decode()
-
- # Add download line with metadata styling
- download_html = '
'
- download_html += '
Download:
'
-
- # Download links in order of importance, matching the zip file contents
- download_html += f'JSON'
-
- # Zip download link (packages everything together)
- download_html += f'Zip Archive'
-
- download_html += '
'
- st.markdown(download_html, unsafe_allow_html=True)
- except Exception as e:
- # Silent fail for downloads - don't disrupt the UI
- pass
-
- # Create a separate line for Time if we have time-related tags
- if 'topics' in result and result['topics']:
- time_tags = [topic for topic in result['topics']
- if any(term in topic.lower() for term in ["century", "pre-", "era"])]
- if time_tags:
- time_html = '
'
- time_html += '
Time:
'
- for tag in time_tags:
- time_html += f'{tag}'
- time_html += '
'
- st.markdown(time_html, unsafe_allow_html=True)
-
- # Then display remaining subject tags if available
- if 'topics' in result and result['topics']:
- # Filter out time-related tags which are already displayed
- subject_tags = [topic for topic in result['topics']
- if not any(term in topic.lower() for term in ["century", "pre-", "era"])]
-
- if subject_tags:
- # Create a separate line for Subject Tags
- tags_html = '
'
- tags_html += '
Subject Tags:
'
- tags_html += '
'
-
- # Generate a badge for each remaining tag
- for topic in subject_tags:
- # Determine tag category class
- tag_class = "subject-tag" # Default class
-
- # Add specialized class based on category
- if any(term in topic.lower() for term in ["language", "english", "french", "german", "latin"]):
- tag_class += " tag-language" # Languages
- elif any(term in topic.lower() for term in ["letter", "newspaper", "book", "form", "document", "recipe"]):
- tag_class += " tag-document-type" # Document types
- elif any(term in topic.lower() for term in ["travel", "military", "science", "medicine", "education", "art", "literature"]):
- tag_class += " tag-subject" # Subject domains
- elif "historical" in topic.lower() and "document" in topic.lower():
- tag_class += " tag-document-type" # "Historical Document Analysis" should be a document type
-
- # Add each tag as an inline span
- tags_html += f'{topic}'
-
- # Close the containers
- tags_html += '
'
-
- # Render the subject tags section
- st.markdown(tags_html, unsafe_allow_html=True)
-
- # Check if we have OCR content
- if 'ocr_contents' in result:
- # Create a single view instead of tabs
- content_tab1 = st.container()
-
- # Check for images in the result to use later
- has_images = result.get('has_images', False)
- has_image_data = ('pages_data' in result and any(page.get('images', []) for page in result.get('pages_data', [])))
- has_raw_images = ('raw_response_data' in result and 'pages' in result['raw_response_data'] and
- any('images' in page for page in result['raw_response_data']['pages']
- if isinstance(page, dict)))
-
- # Display structured content
- with content_tab1:
- # Display structured content with markdown formatting
- if isinstance(result['ocr_contents'], dict):
- # CSS is now handled in the main layout.py file
-
- # Collect all available images from the result
- available_images = []
- if has_images and 'pages_data' in result:
- for page_idx, page in enumerate(result['pages_data']):
- if 'images' in page and len(page['images']) > 0:
- for img_idx, img in enumerate(page['images']):
- if 'image_base64' in img:
- available_images.append({
- 'source': 'pages_data',
- 'page': page_idx,
- 'index': img_idx,
- 'data': img['image_base64']
- })
-
- # Get images from raw response as well
- if 'raw_response_data' in result:
- raw_data = result['raw_response_data']
- if isinstance(raw_data, dict) and 'pages' in raw_data:
- for page_idx, page in enumerate(raw_data['pages']):
- if isinstance(page, dict) and 'images' in page:
- for img_idx, img in enumerate(page['images']):
- if isinstance(img, dict) and 'base64' in img:
- available_images.append({
- 'source': 'raw_response',
- 'page': page_idx,
- 'index': img_idx,
- 'data': img['base64']
- })
-
- # Extract images for display at the top
- images_to_display = []
-
- # First, collect all available images
- for img_idx, img in enumerate(available_images):
- if 'data' in img:
- images_to_display.append({
- 'data': img['data'],
- 'id': img.get('id', f"img_{img_idx}"),
- 'index': img_idx
- })
-
- # Image display now only happens in the Images tab
-
- # Organize sections in a logical order - prioritize main_text
- section_order = ["title", "author", "date", "summary", "main_text", "content", "transcript", "metadata"]
- ordered_sections = []
-
- # Add known sections first in preferred order
- for section_name in section_order:
- if section_name in result['ocr_contents'] and result['ocr_contents'][section_name]:
- ordered_sections.append(section_name)
-
- # Add any remaining sections
- for section in result['ocr_contents'].keys():
- if (section not in ordered_sections and
- section not in ['error', 'partial_text'] and
- result['ocr_contents'][section]):
- ordered_sections.append(section)
-
- # If only raw_text is available and no other content, add it last
- if ('raw_text' in result['ocr_contents'] and
- result['ocr_contents']['raw_text'] and
- len(ordered_sections) == 0):
- ordered_sections.append('raw_text')
-
- # Add minimal spacing before OCR results
- st.markdown("", unsafe_allow_html=True)
-
- # Create tabs for different views
- if has_images:
- tabs = st.tabs(["Document Content", "Raw JSON", "Images"])
- doc_tab, json_tab, img_tab = tabs
- else:
- tabs = st.tabs(["Document Content", "Raw JSON"])
- doc_tab, json_tab = tabs
- img_tab = None
-
- # Document Content tab with simple, clean formatting that matches markdown export files
- with doc_tab:
- # Create a single unified content section
- st.markdown("## Text Content")
-
- # Present content directly in the format used in markdown export files
- if 'ocr_contents' in result and isinstance(result['ocr_contents'], dict):
- # Get all content fields that should be displayed
- content_fields = {}
-
- # Add all available content fields (left_page, right_page, etc)
- for field, content in result['ocr_contents'].items():
- # Skip certain fields that shouldn't be displayed
- if field in ['error', 'partial_text'] or not content:
- continue
-
- # Clean the content if it's a string
- if isinstance(content, str) and content.strip():
- content_fields[field] = content.strip()
- # Handle dictionary or list content
- elif isinstance(content, (dict, list)):
- formatted_content = format_structured_data(content)
- if formatted_content:
- content_fields[field] = formatted_content
-
- # Process nested dictionary structures
- def flatten_content_fields(fields, parent_key=""):
- flat_fields = {}
- for field, content in fields.items():
- # Skip certain fields
- if field in ['error', 'partial_text'] or not content:
- continue
-
- # Handle string content
- if isinstance(content, str) and content.strip():
- key = f"{parent_key}_{field}".strip("_")
- flat_fields[key] = content.strip()
- # Handle dictionary content
- elif isinstance(content, dict):
- # If the dictionary has a 'text' key, extract just that value
- if 'text' in content and isinstance(content['text'], str):
- key = f"{parent_key}_{field}".strip("_")
- flat_fields[key] = content['text'].strip()
- # Otherwise, recursively process nested dictionaries
- else:
- nested_fields = flatten_content_fields(content, f"{parent_key}_{field}")
- flat_fields.update(nested_fields)
- # Handle list content
- elif isinstance(content, list):
- formatted_content = format_structured_data(content)
- if formatted_content:
- key = f"{parent_key}_{field}".strip("_")
- flat_fields[key] = formatted_content
-
- return flat_fields
-
- # Flatten the content structure
- flat_content_fields = flatten_content_fields(result['ocr_contents'])
-
- # Display the flattened content fields with proper formatting
- for field, content in flat_content_fields.items():
- # Skip any empty content
- if not content or not content.strip():
- continue
-
- # Format field name as in the markdown export
- field_display = field.replace('_', ' ')
-
- # Maintain content purity - don't parse text content as JSON
- # Historical text may contain curly braces that aren't JSON
-
- # For raw_text field, display only the content without the field name
- if field == 'raw_text':
- st.markdown(f"{content}")
- else:
- # For other fields, display the field name in bold followed by the content
- st.markdown(f"**{field}:** {content}")
-
- # Add spacing between fields
- st.markdown("\n\n")
-
- # Raw JSON tab - displays the exact same JSON that's downloaded via the JSON button
- with json_tab:
- # Use the same truncated JSON that's used in the download button
- from utils.image_utils import truncate_base64_in_result
- truncated_result = truncate_base64_in_result(result)
-
- # Format the JSON prettily
- json_str = json.dumps(truncated_result, indent=2)
-
- # Display JSON with a copy button using Streamlit's built-in functionality
- st.json(truncated_result)
-
-
- # Images tab - for viewing document images
- if has_images and img_tab:
- with img_tab:
- # Display each available image
- for i, img in enumerate(images_to_display):
- st.image(img['data'], caption=f"Image {i+1}", use_container_width=True)
-
- # Display custom prompt if provided
- if custom_prompt:
- with st.expander("Custom Processing Instructions"):
- st.write(custom_prompt)