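# NOTE: the lines below appear to be residue from a web file viewer and are
# not part of the module; kept as comments so the file stays importable.
#   milwright's picture
#   submit pull for merge
#   85bdb4e verified
#   raw | history blame | 5.2 kB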
import streamlit as st
from pathlib import Path
from layout import gray_container, tool_container, key_concept, quote
# Remote image shown whenever a local illustration cannot be displayed.
# NOTE(review): depends on an external host being reachable at render time —
# confirm the URL is still live or swap in a bundled asset.
_PLACEHOLDER_IMAGE_URL = "https://placekitten.com/800/400"

# HTML fragments handed to the layout helpers. They are kept at column 0 so
# the markup reaches the helpers without any added leading whitespace.
_HYBRID_HTML = """
<h3>Hybrid Methodologies</h3>
<h4>1. Computational + Human Reading</h4>
<ul>
<li>OCR for initial processing and discovery</li>
<li>Human review for context and interpretation</li>
<li>Iterative refinement of computational outputs</li>
</ul>
<h4>2. Close + Distant Reading</h4>
<ul>
<li>Distant reading through large-scale OCR processing</li>
<li>Close reading of selected passages</li>
<li>Zooming between scales of analysis</li>
</ul>
"""

_MISTRAL_HTML = """
<h3>Mistral-OCR-Latest: State-of-the-Art</h3>
<p>The Mistral-OCR model represents a significant advancement:</p>
<ul>
<li><strong>Multimodal Understanding</strong>: Processes both visual and textual information</li>
<li><strong>Contextual Awareness</strong>: Considers historical context</li>
<li><strong>Layout Recognition</strong>: Preserves complex document structures</li>
<li><strong>Historical Font Adaptation</strong>: Trained on diverse historical typography</li>
</ul>
"""

_WORKFLOW_HTML = """
<h3>Practical Workflow</h3>
<p>A typical historical OCR workflow with Mistral-OCR includes:</p>
<ol>
<li><strong>Selection</strong>: Choosing appropriate documents</li>
<li><strong>Preprocessing</strong>: Enhancing images before OCR</li>
<li><strong>OCR Processing</strong>: Running documents through vision-enhanced OCR</li>
<li><strong>Post-processing</strong>: Cleaning up outputs and structured extraction</li>
<li><strong>Verification</strong>: Cross-checking results against originals</li>
<li><strong>Integration</strong>: Incorporating OCR outputs into research materials</li>
</ol>
"""

_ADVANTAGES_HTML = """
<h4>Advantages of Hybrid Approaches</h4>
<ul>
<li>Balance between automation and expert judgment</li>
<li>Ability to process large volumes while preserving detail</li>
<li>Context-sensitive analysis of complex documents</li>
<li>Iterative improvement of results</li>
</ul>
"""

_LIMITATIONS_HTML = """
<h4>Limitations and Challenges</h4>
<ul>
<li>OCR errors requiring expert correction</li>
<li>Bias in training data affecting recognition</li>
<li>Complexity in evaluating OCR quality</li>
<li>Technical infrastructure requirements</li>
</ul>
"""


def _show_image_or_placeholder(image_path: Path, caption: str) -> None:
    """Display the image at *image_path*, falling back to a remote placeholder.

    Args:
        image_path: Location of the local image file to try first.
        caption: Caption for the local image; the fallback image is captioned
            with this text followed by " placeholder".
    """
    try:
        # Imported lazily so a missing Pillow install degrades to the
        # placeholder instead of failing when the module is imported.
        from PIL import Image

        # A missing or unreadable file raises here and is handled below, so no
        # separate existence check is needed.
        with Image.open(image_path) as img:
            # NOTE(review): newer Streamlit releases document
            # `use_column_width` as deprecated — confirm against the pinned
            # Streamlit version before changing this keyword.
            st.image(img, caption=caption, use_column_width=True)
    except Exception:
        # Deliberately broad: any failure to show the illustration is treated
        # as best-effort and replaced by the placeholder.
        st.image(_PLACEHOLDER_IMAGE_URL, caption=f"{caption} placeholder")


def render() -> None:
    """Render the "Module 4: Methodological Approaches" page.

    Emits, in order: the page title, a two-column section (an HTML panel plus
    an illustration in each column), the practical-workflow panel, a
    two-column advantages/limitations section, and a closing quote.
    """
    st.title("Module 4: Methodological Approaches")

    # Illustrations are looked up in an "input" directory two levels above
    # this file.
    input_dir = Path(__file__).parent.parent / "input"

    left, right = st.columns([1, 1])
    with left:
        gray_container(_HYBRID_HTML)
        _show_image_or_placeholder(
            input_dir / "diagram.jpg", "Historical VLM architecture"
        )
    with right:
        gray_container(_MISTRAL_HTML)
        _show_image_or_placeholder(
            input_dir / "workflow.jpg", "Mistral OCR workflow"
        )

    # Practical workflow section.
    # NOTE(review): placed at page level, below the two columns; the original
    # nesting could not be determined from the available source — confirm.
    tool_container(_WORKFLOW_HTML)

    # Methodological considerations
    st.subheader("Methodological Considerations")
    left, right = st.columns([1, 1])
    with left:
        gray_container(_ADVANTAGES_HTML)
    with right:
        gray_container(_LIMITATIONS_HTML)

    # Quote
    quote_content = "The most powerful digital humanities work occurs at the intersection of computational methods and traditional humanistic inquiry."
    quote(quote_content, "Dr. Sarah E. Bond, Digital Humanities Scholar")