Spaces:
Running
Running
| import streamlit as st | |
| import io | |
| import tempfile | |
| from pathlib import Path | |
| from datetime import datetime | |
| from layout import tool_container, key_concept, research_question, upload_container | |
| import sys | |
# Put the parent directory on the import path so the OCR helper module
# can be resolved from there.
sys.path.append(str(Path(__file__).parent.parent))
try:
    from process_file import process_file as process_file_util
except ImportError:
    # The real OCR entry point could not be imported: expose a stand-in
    # with the same call signature so the page still runs.
    def process_file(uploaded_file, use_vision=True, custom_prompt=None):
        """Return canned OCR output for ``uploaded_file``.

        Shows a Streamlit warning that mock processing is in use, then
        builds a result dictionary from ``uploaded_file.name`` and the
        ``use_vision`` flag. ``custom_prompt`` is accepted for signature
        compatibility and is not used.
        """
        st.warning("Using mock processing function. Real OCR functionality is not available.")
        mock_text = f"This is mock OCR content for {uploaded_file.name}. Vision model: {use_vision}"
        mock_result = {
            "file_name": uploaded_file.name,
            "languages": ["English"],
            "topics": ["History", "Document"],
            "ocr_contents": {
                "content": mock_text
            }
        }
        return mock_result
else:
    # Import succeeded: publish the real implementation under the name
    # the rest of this module calls.
    process_file = process_file_util
# Label of the selectbox entry that switches from bundled samples to an upload.
_UPLOAD_CHOICE = "Upload my own document"


class _SampleFileObject:
    """In-memory stand-in for an uploaded file.

    Provides ``name`` and ``getvalue()``, the members this module reads from
    a file object before handing it to ``process_file``.
    """

    def __init__(self, name, data):
        self.name = name
        self._data = data

    def getvalue(self):
        """Return the raw bytes of the file."""
        return self._data


def _rerun():
    """Restart the script run with whichever rerun API Streamlit exposes.

    ``st.rerun`` is preferred; ``st.experimental_rerun`` is used only when
    the installed Streamlit does not provide ``st.rerun``.
    """
    rerun = getattr(st, "rerun", None) or getattr(st, "experimental_rerun")
    rerun()


def _escape(value):
    """HTML-escape ``value`` (coerced to ``str``) for use inside raw markup."""
    from html import escape

    return escape(str(value))


def _yes_no(flag):
    """Return ``'Yes'`` for a truthy flag and ``'No'`` otherwise."""
    return 'Yes' if flag else 'No'


def _format_timestamp(iso_timestamp):
    """Format an ISO-8601 timestamp string as ``YYYY-MM-DD HH:MM``."""
    return datetime.fromisoformat(iso_timestamp).strftime('%Y-%m-%d %H:%M')


def _join_labels(values):
    """Join a collection of labels with commas, tolerating odd inputs.

    Returns ``'Unknown'`` for a missing or empty value, returns a bare string
    unchanged, and stringifies every element so non-string entries cannot
    make ``str.join`` raise.
    """
    if not values:
        return "Unknown"
    if isinstance(values, str):
        return values
    return ", ".join(str(value) for value in values)


def _preview_html(content, limit=500):
    """Build a scrollable HTML box showing at most ``limit`` characters.

    The text is HTML-escaped and its newlines become ``<br>`` so that
    document text can neither inject markup nor break out of the HTML block
    when rendered through Markdown.
    """
    text = content if isinstance(content, str) else str(content)
    snippet = _escape(text[:limit]).replace("\n", "<br>")
    if len(text) > limit:
        snippet += "..."
    return (
        '<div style="max-height: 300px; overflow-y: auto; word-wrap: break-word; '
        'border: 1px solid #374151; padding: 1rem; background-color: #1f2937;">'
        f"{snippet}</div>"
    )


def _load_pdf_converter():
    """Return ``pdf2image.convert_from_bytes``, or ``None`` if unavailable.

    A warning is shown when the module cannot be imported.
    """
    try:
        from pdf2image import convert_from_bytes
    except ImportError:
        st.warning("PDF preview functionality is limited. The pdf2image module is required for PDF previews.")
        return None
    return convert_from_bytes


def _preview_pdf(pdf_bytes, caption, fallback_message, convert_from_bytes):
    """Show the first page of a PDF, or ``fallback_message`` if that fails."""
    try:
        with st.spinner("Generating PDF preview..."):
            images = convert_from_bytes(pdf_bytes, first_page=1, last_page=1, dpi=150)
        if images:
            st.image(images[0], caption=caption, use_column_width=True)
        else:
            st.info(fallback_message)
    except Exception:
        # Previews are best-effort: any conversion problem degrades to a note.
        st.info(fallback_message)


def _preview_sample(path, file_bytes, convert_from_bytes):
    """Preview a bundled sample document read from ``path``."""
    if path.suffix.lower() == ".pdf" and convert_from_bytes is not None:
        _preview_pdf(
            file_bytes,
            f"Preview: {path.name}",
            f"PDF selected: {path.name}",
            convert_from_bytes,
        )
        return
    try:
        from PIL import Image

        img = Image.open(io.BytesIO(file_bytes))
        st.image(img, caption=f"Preview: {path.name}", use_column_width=True)
    except Exception:
        # Also reached for PDFs when no converter is available.
        st.info(f"Selected: {path.name}")


def _upload_document(convert_from_bytes):
    """Render the upload widget with a preview and return the uploaded file.

    Returns ``None`` while nothing has been uploaded. Every caller gets the
    same preview behaviour.
    """
    upload_html = """
    <h4>Upload a document to get started</h4>
    <p>Supported formats: PDF, JPG, PNG</p>
    """
    upload_container(upload_html)
    uploaded_file = st.file_uploader(
        "Choose a file",
        type=["pdf", "png", "jpg", "jpeg"],
        label_visibility="collapsed",
    )
    if uploaded_file is None:
        return None

    pdf_note = f"PDF uploaded: {uploaded_file.name}"
    if Path(uploaded_file.name).suffix.lower() != ".pdf":
        st.image(uploaded_file, use_column_width=True)
    elif convert_from_bytes is not None:
        _preview_pdf(
            uploaded_file.getvalue(),
            f"PDF Preview: {uploaded_file.name}",
            pdf_note,
            convert_from_bytes,
        )
    else:
        st.info(pdf_note)
    return uploaded_file


def _select_document(convert_from_bytes):
    """Let the user pick a bundled sample or upload a file.

    Returns a file-like object exposing ``name`` and ``getvalue()``, or
    ``None`` when nothing is selected. ``st.session_state.sample_file`` is
    set while a sample is selected and removed otherwise, so a stale sample
    can never be processed.
    """
    input_dir = Path(__file__).parent.parent / "input"
    sample_files = []
    if input_dir.is_dir():
        # Sorted so the option list is stable from one rerun to the next.
        sample_files = sorted(
            (path for pattern in ("*.jpg", "*.png", "*.pdf") for path in input_dir.glob(pattern)),
            key=lambda path: path.name,
        )

    if sample_files:
        st.markdown("#### Sample Documents")
        samples_by_name = {path.name: path for path in sample_files}
        sample_choice = st.selectbox("Choose a document:", [_UPLOAD_CHOICE, *samples_by_name])
        selected_file = samples_by_name.get(sample_choice)
        if selected_file is not None:
            file_bytes = selected_file.read_bytes()
            st.session_state.sample_file = {
                "name": selected_file.name,
                "bytes": file_bytes
            }
            _preview_sample(selected_file, file_bytes, convert_from_bytes)
            return _SampleFileObject(selected_file.name, file_bytes)

    # Upload path: forget any previously selected sample.
    if 'sample_file' in st.session_state:
        del st.session_state.sample_file
    return _upload_document(convert_from_bytes)


def _process_document(file_to_process, use_vision, custom_prompt):
    """Run OCR on ``file_to_process`` and record the outcome.

    On success the result is stored in ``st.session_state.current_result``,
    an entry is appended to ``st.session_state.processing_history`` and the
    script is rerun. Failures are reported with ``st.error``.
    """
    with st.spinner("Processing document..."):
        try:
            result = process_file(file_to_process, use_vision, custom_prompt=custom_prompt or None)
        except Exception as exc:
            # UI boundary: surface the problem instead of crashing the page.
            st.error(f"Error processing document: {exc}")
            return

    if not result:
        st.error("Failed to process document.")
        return

    st.success("Document processed successfully!")
    st.session_state.current_result = result

    # One clock reading so the id and the timestamp describe the same instant.
    now = datetime.now()
    history_item = {
        "id": now.timestamp(),
        "fileName": file_to_process.name,
        "timestamp": now.isoformat(),
        "result": result,
        "useVision": use_vision
    }
    if 'processing_history' not in st.session_state:
        st.session_state.processing_history = []
    st.session_state.processing_history.append(history_item)
    _rerun()


def _render_structured_view(result):
    """Render the ``ocr_contents`` mapping of ``result`` section by section."""
    ocr_contents = result.get('ocr_contents')
    if not isinstance(ocr_contents, dict):
        return
    for section, content in ocr_contents.items():
        if not content:
            continue  # Only display non-empty sections
        st.markdown(f"#### {str(section).replace('_', ' ').title()}")
        if isinstance(content, str):
            st.markdown(content)
        elif isinstance(content, list):
            for item in content:
                if isinstance(item, str):
                    st.markdown(f"- {item}")
                elif isinstance(item, dict):
                    st.json(item)
        elif isinstance(content, dict):
            for key, value in content.items():
                st.markdown(f"**{key}:** {value}")


def _render_exports(result):
    """Render the JSON and plain-text download buttons for ``result``."""
    import json

    st.markdown("### Export Results")
    json_col, text_col = st.columns(2)
    with json_col:
        # default=str keeps the export usable if the result holds values
        # JSON cannot represent; ensure_ascii=False keeps non-ASCII text
        # readable in the downloaded file.
        json_bytes = json.dumps(result, indent=2, ensure_ascii=False, default=str).encode()
        st.download_button(
            label="Download JSON",
            data=json_bytes,
            file_name="ocr_results.json",
            mime="application/json",
            use_container_width=True
        )
    with text_col:
        ocr_contents = result.get('ocr_contents')
        if isinstance(ocr_contents, dict) and 'content' in ocr_contents:
            text_content = ocr_contents['content']
            if isinstance(text_content, list):
                text_content = "\n".join(str(part) for part in text_content)
            st.download_button(
                label="Download Text",
                data=str(text_content).encode(),
                file_name="ocr_text.txt",
                mime="text/plain",
                use_container_width=True
            )


def _render_results_panel():
    """Render the current OCR result, or a placeholder when there is none."""
    st.subheader("Step 3: View Results")
    result = st.session_state.get('current_result')
    if not result:
        placeholder_html = """
        <h4>Results will appear here</h4>
        <p>Upload and process a document to see the OCR results in this panel.</p>
        <p>The OCR tool will:</p>
        <ol>
        <li>Extract text from your document</li>
        <li>Identify languages and topics</li>
        <li>Provide structured content analysis</li>
        <li>Generate downloadable results</li>
        </ol>
        """
        tool_container(placeholder_html)
        return

    result_html = (
        f"<h4>Results for: {_escape(result.get('file_name', 'Unknown'))}</h4>\n"
        f"<p><strong>Languages:</strong> {_escape(_join_labels(result.get('languages')))}</p>\n"
        f"<p><strong>Topics:</strong> {_escape(_join_labels(result.get('topics')))}</p>\n"
    )
    tool_container(result_html)

    structured_tab, raw_tab = st.tabs(["Structured View", "Raw JSON"])
    with structured_tab:
        _render_structured_view(result)
    with raw_tab:
        st.json(result)

    _render_exports(result)


def _render_history():
    """Render the latest run and the full processing history, if any."""
    history = st.session_state.get('processing_history')
    if not history:
        return

    st.subheader("Step 4: Review Processing History")
    latest = history[-1]
    latest_html = (
        f"<h4>Latest Document: {_escape(latest['fileName'])}</h4>\n"
        f"<p><strong>Processed at:</strong> {_format_timestamp(latest['timestamp'])}</p>\n"
        f"<p><strong>Vision model used:</strong> {_yes_no(latest['useVision'])}</p>\n"
    )
    tool_container(latest_html)

    with st.expander("View Complete Processing History"):
        for i, item in enumerate(reversed(history)):
            st.markdown(
                '<div style="background-color: var(--color-gray-700); padding: 0.75rem; '
                'border-radius: 0.5rem; margin-bottom: 0.5rem;">'
                f"<strong>{_escape(item['fileName'])}</strong><br>"
                f"{_format_timestamp(item['timestamp'])} - "
                f"Vision model: {_yes_no(item['useVision'])}"
                "</div>",
                unsafe_allow_html=True,
            )
            # Option to view a previous result
            if st.button("View This Result", key=f"view_history_{i}"):
                st.session_state.current_result = item['result']
                _rerun()


def _render_compare_tab():
    """Render the side-by-side comparison of two processed documents."""
    st.subheader("Compare OCR Results")
    history = st.session_state.get('processing_history') or []
    if len(history) < 2:
        need_more_content = """
        <h3>Need More Documents to Compare</h3>
        <p>Process at least two documents to enable side-by-side comparison. Try processing
        the same document with and without the vision model to see the differences in OCR quality.</p>
        """
        research_question(need_more_content)
        return

    st.markdown(
        "Select two processing results to compare side by side. This allows you to see\n"
        "how different options (like using the vision model) affect OCR quality."
    )

    # The widgets select history indices directly; the label is display-only,
    # so the index never has to be parsed back out of the label text.
    labels = [
        f"{i+1}: {item['fileName']} ({'Vision' if item['useVision'] else 'No Vision'})"
        for i, item in enumerate(history)
    ]
    indices = list(range(len(history)))
    first_col, second_col = st.columns(2)
    with first_col:
        doc_index_1 = st.selectbox(
            "First Document:", indices, format_func=labels.__getitem__, key="compare_doc_1"
        )
    with second_col:
        # Default to the second item; the history holds at least two here.
        doc_index_2 = st.selectbox(
            "Second Document:", indices, format_func=labels.__getitem__, key="compare_doc_2", index=1
        )

    doc1 = history[doc_index_1]
    doc2 = history[doc_index_2]

    columns = st.columns(2)
    for column, heading, doc in zip(columns, ("Document 1", "Document 2"), (doc1, doc2)):
        with column:
            doc_html = (
                f"<h4>{heading}: {_escape(doc['fileName'])}</h4>\n"
                f"<p><strong>Processed at:</strong> {_format_timestamp(doc['timestamp'])}</p>\n"
                f"<p><strong>Vision model used:</strong> {_yes_no(doc['useVision'])}</p>\n"
            )
            tool_container(doc_html)
            # Display content summary
            ocr_contents = doc['result'].get('ocr_contents')
            if isinstance(ocr_contents, dict) and 'content' in ocr_contents:
                st.markdown(_preview_html(ocr_contents['content']), unsafe_allow_html=True)

    # Comparison analysis
    if doc1['fileName'] == doc2['fileName'] and doc1['useVision'] != doc2['useVision']:
        comparison_content = """
        <h3>Vision vs. Non-Vision Model Comparison</h3>
        <p>You're comparing the same document processed with different models.
        This is an excellent way to evaluate the impact of vision capabilities on OCR accuracy.</p>
        <p>Look for these differences:</p>
        <ul>
        <li>Completeness of extracted text</li>
        <li>Accuracy of layout understanding</li>
        <li>Recognition of complex elements (tables, figures)</li>
        <li>Topic and language detection accuracy</li>
        </ul>
        """
        key_concept(comparison_content)


def _render_analysis_tab():
    """Render the static analysis guide plus an example from the last run."""
    # gray_container is not among the names imported from layout at the top
    # of this file; import it here and fall back to tool_container when the
    # layout module does not provide it.
    try:
        from layout import gray_container
    except ImportError:
        gray_container = tool_container

    st.subheader("Analysis Guide")
    st.markdown("""
    ### How to Analyze OCR Results

    When analyzing OCR results from historical documents, consider these key factors:

    1. **Text Accuracy**
       - Check for common OCR errors (e.g., mistaking "e" for "c", "l" for "1")
       - Assess recognition of period-specific typography and writing styles
       - Evaluate handling of degraded or damaged text areas
    2. **Structure Preservation**
       - Does the OCR maintain paragraph and section breaks?
       - Are columns and tabular data correctly preserved?
       - How well are page transitions handled?
    3. **Special Elements**
       - Recognition of footnotes, marginalia, and annotations
       - Handling of illustrations, diagrams, and decorative elements
       - Treatment of watermarks, signatures, and stamps
    4. **Metadata Extraction**
       - Accuracy of detected languages, topics, and document type
       - Identification of dates, names, and key entities
       - Recognition of document purpose and context
    """)

    challenges_col, tips_col = st.columns(2)
    with challenges_col:
        challenge_content = """
        <h3>Common OCR Challenges</h3>
        <ul>
        <li><strong>Typography Variations</strong>: Historical fonts that differ from modern text</li>
        <li><strong>Material Degradation</strong>: Fading, stains, tears affecting legibility</li>
        <li><strong>Handwritten Elements</strong>: Marginalia, signatures, and annotations</li>
        <li><strong>Complex Layouts</strong>: Multi-column formats and decorative elements</li>
        <li><strong>Language and Terminology</strong>: Archaic terms and multilingual content</li>
        </ul>
        """
        gray_container(challenge_content)
    with tips_col:
        tips_content = """
        <h3>Making the Most of OCR Results</h3>
        <ul>
        <li><strong>Contextual Reading</strong>: Use context to interpret unclear passages</li>
        <li><strong>Error Patterns</strong>: Identify and correct systematic OCR errors</li>
        <li><strong>Hybrid Analysis</strong>: Combine OCR search with close reading</li>
        <li><strong>Comparative Processing</strong>: Try different settings on documents</li>
        <li><strong>Iterative Refinement</strong>: Use insights to improve future processing</li>
        </ul>
        """
        gray_container(tips_content)

    # Show example analysis if there's processing history
    history = st.session_state.get('processing_history')
    if not history:
        return
    with st.expander("Example Analysis from Your Documents"):
        latest = history[-1]
        # Assembled from unindented lines so interpolated values cannot
        # disturb the Markdown layout.
        st.markdown("\n".join([
            f"#### Sample Analysis for: {latest['fileName']}",
            "",
            "**Document Context:**",
            "",
            f"- Languages: {_join_labels(latest['result'].get('languages'))}",
            f"- Topics: {_join_labels(latest['result'].get('topics'))}",
            f"- Vision model used: {_yes_no(latest['useVision'])}",
            "",
            "**What to Look For:**",
            "",
            "1. Check how well the model identified key topics and languages",
            "2. Evaluate the completeness of extracted text",
            "3. Note any systematic errors in text recognition",
            "4. Assess how well document structure was preserved",
        ]))


def render():
    """Module 5: Interactive OCR Experiment

    Draws the page with three tabs: document processing (options, results
    and history), side-by-side comparison of two runs, and a static analysis
    guide.

    Session state used: ``current_result`` (result shown in the results
    panel), ``processing_history`` (list of past runs) and ``sample_file``
    (the currently selected bundled sample, if any).
    """
    st.title("Module 5: Interactive OCR Experiment")

    # Introduction to the interactive experiment
    intro_content = """
    <h3>Interactive OCR Experiment</h3>
    <p>
    This interactive experiment allows you to process historical documents with OCR and analyze the results.
    Try different settings and compare the outcomes to understand the strengths and limitations of OCR technology.
    </p>
    """
    st.markdown(intro_content, unsafe_allow_html=True)

    # Create tabs for different activities
    experiment_tab, compare_tab, analyze_tab = st.tabs(["Process Documents", "Compare Results", "Analysis Guide"])

    # None when pdf2image is not installed; PDF previews then degrade to notes.
    convert_from_bytes = _load_pdf_converter()

    with experiment_tab:
        options_col, results_col = st.columns([1, 1])
        with options_col:
            st.subheader("Step 1: Select Document & Options")
            use_vision = st.checkbox("Use Vision Model", value=True,
                                     help="Use the vision model for improved analysis")
            st.markdown("### Custom Research Prompt (Optional)")
            st.markdown(
                "Provide additional instructions to guide the OCR analysis.\n"
                "Focus on specific aspects of historical research you're interested in."
            )
            custom_prompt = st.text_area("Research Prompt",
                                         placeholder="E.g., Focus on identifying dates and historical figures...",
                                         help="Optional instructions to guide the analysis")

            file_to_process = _select_document(convert_from_bytes)

            st.subheader("Step 2: Process the Document")
            process_button = st.button(
                "Process Document",
                disabled=file_to_process is None,
                use_container_width=True
            )
            if process_button and file_to_process is not None:
                _process_document(file_to_process, use_vision, custom_prompt)

            # Experiment instructions
            experiment_content = """
            <h3>Experiment Instructions</h3>
            <ol>
            <li><strong>Step 1:</strong> Select a document and choose your options</li>
            <li><strong>Step 2:</strong> Process the document with the selected options</li>
            <li><strong>Step 3:</strong> Analyze the results in the panel on the right</li>
            <li><strong>Step 4:</strong> Try again with different settings (e.g., toggle vision model)</li>
            <li><strong>Step 5:</strong> Compare results between different runs</li>
            </ol>
            """
            key_concept(experiment_content)

        with results_col:
            _render_results_panel()

        _render_history()

    # Compare tab for side-by-side comparison
    with compare_tab:
        _render_compare_tab()

    # Analysis guide tab
    with analyze_tab:
        _render_analysis_tab()