Spaces:

milwright
/

historical-ocr

Running

App Files Files Community

historical-ocr / modules / content / module4.py

milwright

submit pull for merge

85bdb4e verified 11 months ago

raw

history blame

5.2 kB

	import streamlit as st
	from pathlib import Path
	from layout import gray_container, tool_container, key_concept, quote

	def _show_image_or_placeholder(image_path, caption):
	    """Show the image at *image_path*, falling back to a remote placeholder.

	    The local file is opened with Pillow and handed to ``st.image``. If the
	    file does not exist, or anything goes wrong while importing Pillow,
	    opening the file or rendering it, a placeholder image is shown instead
	    with the caption ``"<caption> placeholder"``.

	    Args:
	        image_path: ``pathlib.Path`` of the local image to display.
	        caption: Caption for the real image; also the stem of the
	            placeholder caption.
	    """
	    # NOTE(review): placekitten.com appears to be offline these days - confirm,
	    # and consider shipping a local placeholder asset instead.
	    placeholder_url = "https://placekitten.com/800/400"

	    shown = False
	    if image_path.exists():
	        try:
	            # Imported lazily so a missing Pillow install degrades to the
	            # placeholder instead of breaking the whole page.
	            from PIL import Image

	            with Image.open(image_path) as img:
	                # NOTE(review): use_column_width is deprecated in recent
	                # Streamlit releases; kept as-is for the version this app runs on.
	                st.image(img, caption=caption, use_column_width=True)
	            shown = True
	        except Exception:
	            # Deliberate best-effort: a broken image must not take the page
	            # down, but record why so the failure is not invisible.
	            import logging

	            logging.getLogger(__name__).warning(
	                "Could not display %s; showing placeholder instead",
	                image_path,
	                exc_info=True,
	            )

	    if not shown:
	        st.image(placeholder_url, caption=f"{caption} placeholder")


	def render():
	    """Module 4: Methodological Approaches

	    Renders the page top to bottom: a title, a two-column section (hybrid
	    methodologies with a diagram on the left, the Mistral-OCR overview with
	    a workflow image on the right), a "Practical Workflow" list, a
	    two-column "Methodological Considerations" section (advantages vs.
	    limitations) and a closing quote.

	    Takes no arguments and returns ``None``; all output goes through
	    Streamlit and the ``layout`` helpers.
	    """
	    st.title("Module 4: Methodological Approaches")

	    # Both column images are looked up in the "input" directory two levels
	    # above this file. Resolved once here rather than inside the first column
	    # so the second column does not depend on a name bound in the first.
	    input_dir = Path(__file__).parent.parent / "input"

	    col1, col2 = st.columns([1, 1])

	    with col1:
	        hybrid_content = """
	<h3>Hybrid Methodologies</h3>

	<h4>1. Computational + Human Reading</h4>
	<ul>
	<li>OCR for initial processing and discovery</li>
	<li>Human review for context and interpretation</li>
	<li>Iterative refinement of computational outputs</li>
	</ul>

	<h4>2. Close + Distant Reading</h4>
	<ul>
	<li>Distant reading through large-scale OCR processing</li>
	<li>Close reading of selected passages</li>
	<li>Zooming between scales of analysis</li>
	</ul>
	"""
	        gray_container(hybrid_content)

	        # Architecture diagram, or a placeholder when it is unavailable.
	        _show_image_or_placeholder(
	            input_dir / "diagram.jpg", "Historical VLM architecture"
	        )

	    with col2:
	        mistral_content = """
	<h3>Mistral-OCR-Latest: State-of-the-Art</h3>

	<p>The Mistral-OCR model represents a significant advancement:</p>
	<ul>
	<li><strong>Multimodal Understanding</strong>: Processes both visual and textual information</li>
	<li><strong>Contextual Awareness</strong>: Considers historical context</li>
	<li><strong>Layout Recognition</strong>: Preserves complex document structures</li>
	<li><strong>Historical Font Adaptation</strong>: Trained on diverse historical typography</li>
	</ul>
	"""
	        gray_container(mistral_content)

	        # Workflow illustration, or a placeholder when it is unavailable.
	        _show_image_or_placeholder(
	            input_dir / "workflow.jpg", "Mistral OCR workflow"
	        )

	    # Practical workflow section
	    # NOTE(review): rendered full-width, outside the columns above - confirm
	    # this matches the intended layout.
	    workflow_content = """
	<h3>Practical Workflow</h3>

	<p>A typical historical OCR workflow with Mistral-OCR includes:</p>
	<ol>
	<li><strong>Selection</strong>: Choosing appropriate documents</li>
	<li><strong>Preprocessing</strong>: Enhancing images before OCR</li>
	<li><strong>OCR Processing</strong>: Running documents through vision-enhanced OCR</li>
	<li><strong>Post-processing</strong>: Cleaning up outputs and structured extraction</li>
	<li><strong>Verification</strong>: Cross-checking results against originals</li>
	<li><strong>Integration</strong>: Incorporating OCR outputs into research materials</li>
	</ol>
	"""
	    tool_container(workflow_content)

	    # Methodological considerations
	    st.subheader("Methodological Considerations")

	    col1, col2 = st.columns([1, 1])

	    with col1:
	        advantages_content = """
	<h4>Advantages of Hybrid Approaches</h4>
	<ul>
	<li>Balance between automation and expert judgment</li>
	<li>Ability to process large volumes while preserving detail</li>
	<li>Context-sensitive analysis of complex documents</li>
	<li>Iterative improvement of results</li>
	</ul>
	"""
	        gray_container(advantages_content)

	    with col2:
	        limitations_content = """
	<h4>Limitations and Challenges</h4>
	<ul>
	<li>OCR errors requiring expert correction</li>
	<li>Bias in training data affecting recognition</li>
	<li>Complexity in evaluating OCR quality</li>
	<li>Technical infrastructure requirements</li>
	</ul>
	"""
	        gray_container(limitations_content)

	    # Quote
	    quote_content = "The most powerful digital humanities work occurs at the intersection of computational methods and traditional humanistic inquiry."
	    quote(quote_content, "Dr. Sarah E. Bond, Digital Humanities Scholar")