Spaces:

milwright
/

historical-ocr

Running

File size: 5,203 Bytes

85bdb4e

import streamlit as st
from pathlib import Path
from layout import gray_container, tool_container, key_concept, quote

# Remote image shown whenever a local illustration cannot be displayed.
_PLACEHOLDER_IMAGE_URL = "https://placekitten.com/800/400"


def _show_image_or_placeholder(image_path, caption):
    """Display the image at *image_path*, falling back to a placeholder.

    Args:
        image_path: ``pathlib.Path`` of the local image file to show.
        caption: Caption for the image. The placeholder reuses it with
            the suffix `` placeholder`` appended.

    The placeholder is shown when the file does not exist or when
    importing Pillow, opening the file, or rendering it raises.
    """
    if image_path.exists():
        try:
            # Imported lazily so a missing Pillow install degrades to the
            # placeholder instead of breaking the whole page.
            from PIL import Image
            with Image.open(image_path) as img:
                # NOTE(review): `use_column_width` is deprecated in recent
                # Streamlit releases; kept as-is because the Streamlit
                # version this app runs on is not visible here -- confirm
                # before switching to the replacement parameter.
                st.image(img, caption=caption, use_column_width=True)
            return
        except Exception:
            # Deliberately best-effort: an unreadable illustration must not
            # take the page down, but the failure should be visible in logs.
            import logging
            logging.getLogger(__name__).warning(
                "Could not display image %s; showing placeholder instead",
                image_path,
                exc_info=True,
            )

    st.image(_PLACEHOLDER_IMAGE_URL, caption=f"{caption} placeholder")


def render():
    """Render the "Module 4: Methodological Approaches" page.

    Lays out, top to bottom: a two-column section (hybrid methodologies
    with a diagram on the left, the Mistral-OCR overview with a workflow
    image on the right), a practical-workflow list, a two-column
    advantages/limitations section, and a closing quote.

    Takes no arguments and returns ``None``; all output goes through
    Streamlit and the ``layout`` helpers.
    """

    st.title("Module 4: Methodological Approaches")

    # Illustrations live in the "input" directory two levels above this
    # file. Resolved once, up front, because both columns need it.
    input_dir = Path(__file__).parent.parent / "input"

    col1, col2 = st.columns([1, 1])

    with col1:
        hybrid_content = """
        <h3>Hybrid Methodologies</h3>
        
        <h4>1. Computational + Human Reading</h4>
        <ul>
            <li>OCR for initial processing and discovery</li>
            <li>Human review for context and interpretation</li>
            <li>Iterative refinement of computational outputs</li>
        </ul>
        
        <h4>2. Close + Distant Reading</h4>
        <ul>
            <li>Distant reading through large-scale OCR processing</li>
            <li>Close reading of selected passages</li>
            <li>Zooming between scales of analysis</li>
        </ul>
        """
        gray_container(hybrid_content)

        _show_image_or_placeholder(
            input_dir / "diagram.jpg", "Historical VLM architecture"
        )

    with col2:
        mistral_content = """
        <h3>Mistral-OCR-Latest: State-of-the-Art</h3>
        
        <p>The Mistral-OCR model represents a significant advancement:</p>
        <ul>
            <li><strong>Multimodal Understanding</strong>: Processes both visual and textual information</li>
            <li><strong>Contextual Awareness</strong>: Considers historical context</li>
            <li><strong>Layout Recognition</strong>: Preserves complex document structures</li>
            <li><strong>Historical Font Adaptation</strong>: Trained on diverse historical typography</li>
        </ul>
        """
        gray_container(mistral_content)

        _show_image_or_placeholder(
            input_dir / "workflow.jpg", "Mistral OCR workflow"
        )

    # Practical workflow section
    workflow_content = """
    <h3>Practical Workflow</h3>
    
    <p>A typical historical OCR workflow with Mistral-OCR includes:</p>
    <ol>
        <li><strong>Selection</strong>: Choosing appropriate documents</li>
        <li><strong>Preprocessing</strong>: Enhancing images before OCR</li>
        <li><strong>OCR Processing</strong>: Running documents through vision-enhanced OCR</li>
        <li><strong>Post-processing</strong>: Cleaning up outputs and structured extraction</li>
        <li><strong>Verification</strong>: Cross-checking results against originals</li>
        <li><strong>Integration</strong>: Incorporating OCR outputs into research materials</li>
    </ol>
    """
    tool_container(workflow_content)

    # Methodological considerations
    st.subheader("Methodological Considerations")

    advantages_col, limitations_col = st.columns([1, 1])

    with advantages_col:
        advantages_content = """
        <h4>Advantages of Hybrid Approaches</h4>
        <ul>
            <li>Balance between automation and expert judgment</li>
            <li>Ability to process large volumes while preserving detail</li>
            <li>Context-sensitive analysis of complex documents</li>
            <li>Iterative improvement of results</li>
        </ul>
        """
        gray_container(advantages_content)

    with limitations_col:
        limitations_content = """
        <h4>Limitations and Challenges</h4>
        <ul>
            <li>OCR errors requiring expert correction</li>
            <li>Bias in training data affecting recognition</li>
            <li>Complexity in evaluating OCR quality</li>
            <li>Technical infrastructure requirements</li>
        </ul>
        """
        gray_container(limitations_content)

    # Quote
    quote_content = "The most powerful digital humanities work occurs at the intersection of computational methods and traditional humanistic inquiry."
    quote(quote_content, "Dr. Sarah E. Bond, Digital Humanities Scholar")