milwright's picture
submit pull for merge
85bdb4e verified
raw
history blame
4.61 kB
import streamlit as st
from pathlib import Path
from layout import gray_container, tool_container, key_concept, research_question
# Static HTML fragments shown on the Module 3 page. They are kept at module
# level so render() stays short and the markup is not mixed into layout logic.
# Each fragment is handed unchanged to one of the helpers imported from
# `layout`.
_TRADITIONAL_OCR_HTML = """
<h3>Traditional OCR Approaches</h3>
<ol>
<li><strong>Pattern Matching</strong>: Early OCR compared characters to templates</li>
<li><strong>Feature Extraction</strong>: Identifying key features of characters</li>
<li><strong>Statistical Models</strong>: Using probabilities to improve recognition</li>
</ol>
"""

_MODERN_OCR_HTML = """
<h3>Modern AI-Enhanced OCR</h3>
<ol>
<li><strong>Neural Networks</strong>: Deep learning models trained on vast datasets</li>
<li><strong>Computer Vision</strong>: Advanced image processing techniques</li>
<li><strong>Language Models</strong>: Contextual understanding to resolve ambiguities</li>
<li><strong>Multimodal Models</strong>: Integration of text, layout, and visual understanding</li>
</ol>
"""

_CHALLENGES_HTML = """
<h3>Challenges with Historical Documents</h3>
<p>Historical materials present unique difficulties:</p>
<ul>
<li><strong>Typography Variation</strong>: Non-standardized fonts and styles</li>
<li><strong>Historical Language</strong>: Archaic vocabulary and grammar</li>
<li><strong>Layout Complexity</strong>: Non-linear arrangements</li>
<li><strong>Document Degradation</strong>: Fading, tears, stains, and damage</li>
<li><strong>Material Artifacts</strong>: Paper texture, binding shadows, etc.</li>
</ul>
"""

_VISION_OCR_CONCEPT_HTML = """
<h3>Vision-Enhanced OCR</h3>
<p>Modern OCR systems like those based on Mistral-7B-Vision combine:</p>
<ol>
<li>Image understanding capabilities to process the visual aspects</li>
<li>Text recognition to extract characters accurately</li>
<li>Layout analysis to understand structure</li>
<li>Contextual language processing for improved accuracy</li>
</ol>
<p>This multimodal approach dramatically improves OCR results on historical documents compared to traditional OCR.</p>
"""

_TECHNICAL_EVOLUTION_HTML = """
<h3>Technical Evolution of OCR</h3>
<p><strong>Traditional OCR Pipeline:</strong></p>
<ol>
<li>Preprocessing (binarization, noise removal)</li>
<li>Layout analysis (segmentation)</li>
<li>Character recognition (pattern matching)</li>
<li>Post-processing (spell checking)</li>
</ol>
<p><strong>Modern LLM-Vision Pipeline:</strong></p>
<ol>
<li>Image normalization</li>
<li>Image embedding via vision encoder</li>
<li>Integration with language model</li>
<li>Joint inference across modalities</li>
<li>Structured extraction of information</li>
</ol>
"""

_RESEARCH_QUESTION_HTML = """
<h4>Consider This:</h4>
<p>How might the capabilities of vision-language models change our approach to digitizing historical archives?</p>
"""

_OCR_DIAGRAM_URL = "https://cdn.dribbble.com/users/412119/screenshots/16353886/media/82e593c60a5e4d460db917236eab6ece.jpg"


def _render_processing_history(history):
    """Show one summary row per previously processed document.

    Args:
        history: Iterable of history entries taken from
            ``st.session_state.processing_history``. Each entry is read with
            the keys ``fileName``, ``result`` (a mapping that may contain
            ``topics``) and ``useVision``.
            NOTE(review): entries are assumed to be plain dicts -- confirm
            against the code that appends to the history.
    """
    with st.expander("Your OCR Processing History"):
        st.markdown("You've already processed the following documents:")
        for item in history:
            st.markdown(f"**{item.get('fileName', 'Unknown')}**")

            # A missing/None result, or a missing/None/empty topic list, all
            # fall back to "Unknown" instead of raising or printing a blank.
            result = item.get('result') or {}
            topics = result.get('topics') or ['Unknown']
            # str() keeps join() from failing on non-string topic values.
            topics_text = ', '.join(str(topic) for topic in topics)
            vision_text = 'Yes' if item.get('useVision') else 'No'

            # Distinct names so the page-level columns are not shadowed.
            topics_col, vision_col = st.columns(2)
            with topics_col:
                st.write(f"**Topics:** {topics_text}")
            with vision_col:
                st.write(f"**Vision model used:** {vision_text}")


def render() -> None:
    """Render Module 3: OCR Technology and Historical Documents.

    Draws the page title, a two-column overview (traditional and modern OCR
    on the left; historical-document challenges and a diagram on the right),
    then the key-concept, technical-details and research-question panels.
    If ``st.session_state`` holds a non-empty ``processing_history``, an
    expander summarising those documents is appended at the end.
    """
    st.title("Module 3: OCR Technology and Historical Documents")

    left_col, right_col = st.columns([1, 1])

    with left_col:
        gray_container(_TRADITIONAL_OCR_HTML)
        gray_container(_MODERN_OCR_HTML)

    with right_col:
        gray_container(_CHALLENGES_HTML)
        # Display OCR processing diagram.
        # NOTE(review): the diagram is placed in the right-hand column under
        # the challenges panel -- confirm this matches the intended layout.
        st.image(_OCR_DIAGRAM_URL, caption="OCR processing layers")

    # Key concept section
    key_concept(_VISION_OCR_CONCEPT_HTML)

    # Technical details in a tool container
    tool_container(_TECHNICAL_EVOLUTION_HTML)

    # Research question
    research_question(_RESEARCH_QUESTION_HTML)

    # Display history if available
    if 'processing_history' not in st.session_state:
        return
    history = st.session_state.processing_history
    if history:
        _render_processing_history(history)