Spaces:
Running
Running
| import streamlit as st | |
| from pathlib import Path | |
| from layout import gray_container, tool_container, key_concept, quote | |
# Remote image shown whenever a local figure is missing or cannot be rendered.
# NOTE(review): confirm this host is still reachable from the deployment.
_PLACEHOLDER_URL = "https://placekitten.com/800/400"

# HTML fragments handed to the layout helpers. They live at module level so
# that render() reads as page structure rather than markup.
_HYBRID_HTML = """
<h3>Hybrid Methodologies</h3>
<h4>1. Computational + Human Reading</h4>
<ul>
<li>OCR for initial processing and discovery</li>
<li>Human review for context and interpretation</li>
<li>Iterative refinement of computational outputs</li>
</ul>
<h4>2. Close + Distant Reading</h4>
<ul>
<li>Distant reading through large-scale OCR processing</li>
<li>Close reading of selected passages</li>
<li>Zooming between scales of analysis</li>
</ul>
"""

_MISTRAL_HTML = """
<h3>Mistral-OCR-Latest: State-of-the-Art</h3>
<p>The Mistral-OCR model represents a significant advancement:</p>
<ul>
<li><strong>Multimodal Understanding</strong>: Processes both visual and textual information</li>
<li><strong>Contextual Awareness</strong>: Considers historical context</li>
<li><strong>Layout Recognition</strong>: Preserves complex document structures</li>
<li><strong>Historical Font Adaptation</strong>: Trained on diverse historical typography</li>
</ul>
"""

_WORKFLOW_HTML = """
<h3>Practical Workflow</h3>
<p>A typical historical OCR workflow with Mistral-OCR includes:</p>
<ol>
<li><strong>Selection</strong>: Choosing appropriate documents</li>
<li><strong>Preprocessing</strong>: Enhancing images before OCR</li>
<li><strong>OCR Processing</strong>: Running documents through vision-enhanced OCR</li>
<li><strong>Post-processing</strong>: Cleaning up outputs and structured extraction</li>
<li><strong>Verification</strong>: Cross-checking results against originals</li>
<li><strong>Integration</strong>: Incorporating OCR outputs into research materials</li>
</ol>
"""

_ADVANTAGES_HTML = """
<h4>Advantages of Hybrid Approaches</h4>
<ul>
<li>Balance between automation and expert judgment</li>
<li>Ability to process large volumes while preserving detail</li>
<li>Context-sensitive analysis of complex documents</li>
<li>Iterative improvement of results</li>
</ul>
"""

_LIMITATIONS_HTML = """
<h4>Limitations and Challenges</h4>
<ul>
<li>OCR errors requiring expert correction</li>
<li>Bias in training data affecting recognition</li>
<li>Complexity in evaluating OCR quality</li>
<li>Technical infrastructure requirements</li>
</ul>
"""

_QUOTE_TEXT = "The most powerful digital humanities work occurs at the intersection of computational methods and traditional humanistic inquiry."


def _show_figure(image_path: Path, caption: str) -> None:
    """Display the image at *image_path*, or a placeholder if that fails.

    The local file is opened with Pillow and shown with ``st.image`` under
    *caption*. When the file does not exist, or anything goes wrong while
    importing Pillow, opening the file, or rendering it, the remote
    placeholder is shown instead with the caption ``"<caption> placeholder"``.

    Args:
        image_path: Location of the figure on disk.
        caption: Caption for the real figure; also the stem of the
            placeholder caption.
    """
    if image_path.exists():
        try:
            # Kept as a local import inside the try so that an import failure
            # is handled by the same placeholder fallback as any other error.
            from PIL import Image

            with Image.open(image_path) as img:
                # NOTE(review): newer Streamlit releases reportedly deprecate
                # use_column_width in favour of use_container_width; confirm
                # against the pinned Streamlit version before switching.
                st.image(img, caption=caption, use_column_width=True)
        except Exception:
            # Deliberate best-effort at the UI boundary: a broken figure must
            # not take the page down. Record the failure so it is diagnosable,
            # then fall through to the placeholder.
            import logging

            logging.getLogger(__name__).warning(
                "Could not display %s; showing placeholder instead",
                image_path,
                exc_info=True,
            )
        else:
            return

    st.image(_PLACEHOLDER_URL, caption=f"{caption} placeholder")


def render() -> None:
    """Render the "Module 4: Methodological Approaches" page.

    Page order, top to bottom:

    1. Title.
    2. Two equal columns: hybrid methodologies with the ``diagram.jpg``
       figure on the left, the Mistral-OCR overview with the
       ``workflow.jpg`` figure on the right. Each figure falls back to a
       placeholder image when it cannot be shown.
    3. The practical workflow list, passed to ``tool_container``.
    4. A "Methodological Considerations" subheader above two equal columns
       listing advantages and limitations.
    5. A closing quotation, passed to ``quote``.
    """
    st.title("Module 4: Methodological Approaches")

    # Figures are looked up in an "input" directory located two levels above
    # this file. Resolved once here so both columns share it explicitly.
    input_dir = Path(__file__).parent.parent / "input"

    left, right = st.columns([1, 1])
    with left:
        gray_container(_HYBRID_HTML)
        _show_figure(input_dir / "diagram.jpg", "Historical VLM architecture")
    with right:
        gray_container(_MISTRAL_HTML)
        _show_figure(input_dir / "workflow.jpg", "Mistral OCR workflow")

    # Practical workflow section
    tool_container(_WORKFLOW_HTML)

    # Methodological considerations
    st.subheader("Methodological Considerations")
    left, right = st.columns([1, 1])
    with left:
        gray_container(_ADVANTAGES_HTML)
    with right:
        gray_container(_LIMITATIONS_HTML)

    # Quote
    quote(_QUOTE_TEXT, "Dr. Sarah E. Bond, Digital Humanities Scholar")