Spaces:
Running
Running
Fix sample document loading and processing pipeline
Browse files
- Fixed sample document loading to automatically process after selection
- Enhanced SampleDocument class with better file emulation
- Added session state management for reliable sample processing
- Improved user feedback during sample document processing
- Updated CLAUDE.md with improved documentation
CLAUDE.md
CHANGED
|
@@ -8,6 +8,7 @@ This file provides guidance to Claude Code (claude.ai/code) when working with co
|
|
| 8 |
- Process PDF files: `python pdf_ocr.py <file_path>`
|
| 9 |
- Process single file with logging: `python process_file.py <file_path>`
|
| 10 |
- Run newspaper test: `python test_newspaper.py <file_path>`
|
|
|
|
| 11 |
- Run typechecking: `mypy .`
|
| 12 |
- Lint code: `ruff check .` or `flake8`
|
| 13 |
|
|
@@ -23,6 +24,7 @@ This file provides guidance to Claude Code (claude.ai/code) when working with co
|
|
| 23 |
- **Naming**: snake_case for variables/functions, PascalCase for classes
|
| 24 |
- **Documentation**: Google-style docstrings for all functions/classes
|
| 25 |
- **Logging**: Use module-level loggers with appropriate log levels
|
|
|
|
| 26 |
- **Line length**: ≤100 characters
|
| 27 |
|
| 28 |
## Architecture
|
|
@@ -30,4 +32,5 @@ This file provides guidance to Claude Code (claude.ai/code) when working with co
|
|
| 30 |
- Utils: `ocr_utils.py` - OCR text and image processing utilities
|
| 31 |
- PDF handling: `pdf_ocr.py` - PDF-specific processing functionality
|
| 32 |
- Config: `config.py` - Configuration settings and API keys
|
| 33 |
-
- Web: `app.py` - Streamlit interface with UI components in `/ui` directory
|
|
|
|
|
|
| 8 |
- Process PDF files: `python pdf_ocr.py <file_path>`
|
| 9 |
- Process single file with logging: `python process_file.py <file_path>`
|
| 10 |
- Run newspaper test: `python test_newspaper.py <file_path>`
|
| 11 |
+
- Run notebook demo: `jupyter notebook notebook_demo.ipynb`
|
| 12 |
- Run typechecking: `mypy .`
|
| 13 |
- Lint code: `ruff check .` or `flake8`
|
| 14 |
|
|
|
|
| 24 |
- **Naming**: snake_case for variables/functions, PascalCase for classes
|
| 25 |
- **Documentation**: Google-style docstrings for all functions/classes
|
| 26 |
- **Logging**: Use module-level loggers with appropriate log levels
|
| 27 |
+
- **Exception handling**: Implement graceful fallbacks for API errors
|
| 28 |
- **Line length**: ≤100 characters
|
| 29 |
|
| 30 |
## Architecture
|
|
|
|
| 32 |
- Utils: `ocr_utils.py` - OCR text and image processing utilities
|
| 33 |
- PDF handling: `pdf_ocr.py` - PDF-specific processing functionality
|
| 34 |
- Config: `config.py` - Configuration settings and API keys
|
| 35 |
+
- Web: `app.py` - Streamlit interface with UI components in `/ui` directory
|
| 36 |
+
- Demo: `notebook_demo.ipynb` - Interactive notebook with educational examples
|
app.py
CHANGED
|
@@ -511,12 +511,12 @@ with main_tab1:
|
|
| 511 |
# Add heading for the file uploader (just text, no container)
|
| 512 |
st.markdown('### Upload Document')
|
| 513 |
|
| 514 |
-
# Model info
|
| 515 |
-
st.markdown("Using the latest `mistral-ocr-latest` model for advanced document understanding.")
|
| 516 |
|
| 517 |
# Enhanced file uploader with better help text
|
| 518 |
uploaded_file = st.file_uploader("Drag and drop PDFs or images here", type=["pdf", "png", "jpg", "jpeg"],
|
| 519 |
-
help="
|
| 520 |
|
| 521 |
# Removed seed prompt instructions from here, moving to sidebar
|
| 522 |
|
|
@@ -917,6 +917,8 @@ with main_tab2:
|
|
| 917 |
badge_color = "#6a1b9a" # Purple for document types
|
| 918 |
elif any(term in topic.lower() for term in ["travel", "military", "science", "medicine", "education", "art", "literature"]):
|
| 919 |
badge_color = "#2e7d32" # Green for subject domains
|
|
|
|
|
|
|
| 920 |
|
| 921 |
st.markdown(
|
| 922 |
f'<span style="background-color: {badge_color}; color: white; padding: 3px 8px; '
|
|
@@ -1193,6 +1195,27 @@ with main_tab3:
|
|
| 1193 |
""")
|
| 1194 |
|
| 1195 |
with main_tab1:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1196 |
if uploaded_file is not None:
|
| 1197 |
# Check file size (cap at 50MB)
|
| 1198 |
file_size_mb = len(uploaded_file.getvalue()) / (1024 * 1024)
|
|
@@ -1247,8 +1270,21 @@ with main_tab1:
|
|
| 1247 |
# No extra spacing needed as it will be managed programmatically
|
| 1248 |
metadata_placeholder = st.empty()
|
| 1249 |
|
| 1250 |
-
#
|
| 1251 |
-
if
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1252 |
# Move the progress indicator reference to just below the button
|
| 1253 |
progress_container = progress_placeholder
|
| 1254 |
try:
|
|
@@ -1477,8 +1513,8 @@ with main_tab1:
|
|
| 1477 |
# Only show when custom_prompt exists in the session AND has content, or when the result explicitly states it was applied
|
| 1478 |
has_instructions = ('custom_prompt' in locals() and custom_prompt and len(str(custom_prompt).strip()) > 0)
|
| 1479 |
if has_instructions or 'custom_prompt_applied' in result:
|
| 1480 |
-
# Use
|
| 1481 |
-
metadata_html += f'<p
|
| 1482 |
|
| 1483 |
# Close the metadata card
|
| 1484 |
metadata_html += '</div>'
|
|
@@ -1936,6 +1972,63 @@ with main_tab1:
|
|
| 1936 |
|
| 1937 |
if 'ocr_contents' not in result:
|
| 1938 |
st.error("No OCR content was extracted from the document.")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1939 |
|
| 1940 |
# Close document content div
|
| 1941 |
st.markdown('</div>', unsafe_allow_html=True)
|
|
@@ -2038,6 +2131,41 @@ with main_tab1:
|
|
| 2038 |
lang_tag = f"{lang} Language"
|
| 2039 |
subject_tags.append(lang_tag)
|
| 2040 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2041 |
except Exception as e:
|
| 2042 |
logger.warning(f"Error generating subject tags: {str(e)}")
|
| 2043 |
# Fallback tags if extraction fails
|
|
@@ -2094,9 +2222,7 @@ with main_tab1:
|
|
| 2094 |
except Exception as e:
|
| 2095 |
st.error(f"Error processing document: {str(e)}")
|
| 2096 |
else:
|
| 2097 |
-
#
|
| 2098 |
-
|
| 2099 |
-
# Show example images in a simpler layout
|
| 2100 |
st.subheader("Example Documents")
|
| 2101 |
|
| 2102 |
# Add a simplified info message about examples
|
|
@@ -2106,9 +2232,115 @@ with main_tab1:
|
|
| 2106 |
- Handwritten letters and documents
|
| 2107 |
- Printed books and articles
|
| 2108 |
- Multi-page PDFs
|
| 2109 |
-
|
| 2110 |
-
Upload your own document to get started or explore the 'About' tab for more information.
|
| 2111 |
""")
|
| 2112 |
|
| 2113 |
-
#
|
| 2114 |
-
st.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 511 |
# Add heading for the file uploader (just text, no container)
|
| 512 |
st.markdown('### Upload Document')
|
| 513 |
|
| 514 |
+
# Model info with clearer instructions
|
| 515 |
+
st.markdown("Using the latest `mistral-ocr-latest` model for advanced document understanding. To get started upload your own document, use an example document, or explore the 'About' tab for more info.")
|
| 516 |
|
| 517 |
# Enhanced file uploader with better help text
|
| 518 |
uploaded_file = st.file_uploader("Drag and drop PDFs or images here", type=["pdf", "png", "jpg", "jpeg"],
|
| 519 |
+
help="Limit 200MB per file • PDF, PNG, JPG, JPEG")
|
| 520 |
|
| 521 |
# Removed seed prompt instructions from here, moving to sidebar
|
| 522 |
|
|
|
|
| 917 |
badge_color = "#6a1b9a" # Purple for document types
|
| 918 |
elif any(term in topic.lower() for term in ["travel", "military", "science", "medicine", "education", "art", "literature"]):
|
| 919 |
badge_color = "#2e7d32" # Green for subject domains
|
| 920 |
+
elif any(term in topic.lower() for term in ["preprocessed", "enhanced", "grayscale", "denoised", "contrast", "rotated"]):
|
| 921 |
+
badge_color = "#e65100" # Orange for preprocessing-related tags
|
| 922 |
|
| 923 |
st.markdown(
|
| 924 |
f'<span style="background-color: {badge_color}; color: white; padding: 3px 8px; '
|
|
|
|
| 1195 |
""")
|
| 1196 |
|
| 1197 |
with main_tab1:
|
| 1198 |
+
# Initialize session states if needed
|
| 1199 |
+
if 'auto_process_sample' not in st.session_state:
|
| 1200 |
+
st.session_state.auto_process_sample = False
|
| 1201 |
+
if 'sample_just_loaded' not in st.session_state:
|
| 1202 |
+
st.session_state.sample_just_loaded = False
|
| 1203 |
+
|
| 1204 |
+
# Use uploaded_file or sample_document if available
|
| 1205 |
+
if 'sample_document' in st.session_state and st.session_state.sample_document is not None:
|
| 1206 |
+
# Use the sample document
|
| 1207 |
+
uploaded_file = st.session_state.sample_document
|
| 1208 |
+
# Add a notice about using sample document
|
| 1209 |
+
st.success(f"Using sample document: {uploaded_file.name}")
|
| 1210 |
+
|
| 1211 |
+
# Set auto-process flag in session state if this is a newly loaded sample
|
| 1212 |
+
if st.session_state.sample_just_loaded:
|
| 1213 |
+
st.session_state.auto_process_sample = True
|
| 1214 |
+
st.session_state.sample_just_loaded = False
|
| 1215 |
+
|
| 1216 |
+
# Clear sample document after use to avoid interference with future uploads
|
| 1217 |
+
st.session_state.sample_document = None
|
| 1218 |
+
|
| 1219 |
if uploaded_file is not None:
|
| 1220 |
# Check file size (cap at 50MB)
|
| 1221 |
file_size_mb = len(uploaded_file.getvalue()) / (1024 * 1024)
|
|
|
|
| 1270 |
# No extra spacing needed as it will be managed programmatically
|
| 1271 |
metadata_placeholder = st.empty()
|
| 1272 |
|
| 1273 |
+
# Check if we need to auto-process a sample document
|
| 1274 |
+
if 'auto_process_sample' not in st.session_state:
|
| 1275 |
+
st.session_state.auto_process_sample = False
|
| 1276 |
+
|
| 1277 |
+
# Results section - process if button clicked or auto-process flag is set
|
| 1278 |
+
process_now = process_button or st.session_state.auto_process_sample
|
| 1279 |
+
|
| 1280 |
+
# Show a message if auto-processing
|
| 1281 |
+
if st.session_state.auto_process_sample:
|
| 1282 |
+
st.info("Automatically processing sample document...")
|
| 1283 |
+
|
| 1284 |
+
if process_now:
|
| 1285 |
+
# Reset auto-process flag to avoid processing on next rerun
|
| 1286 |
+
if st.session_state.auto_process_sample:
|
| 1287 |
+
st.session_state.auto_process_sample = False
|
| 1288 |
# Move the progress indicator reference to just below the button
|
| 1289 |
progress_container = progress_placeholder
|
| 1290 |
try:
|
|
|
|
| 1513 |
# Only show when custom_prompt exists in the session AND has content, or when the result explicitly states it was applied
|
| 1514 |
has_instructions = ('custom_prompt' in locals() and custom_prompt and len(str(custom_prompt).strip()) > 0)
|
| 1515 |
if has_instructions or 'custom_prompt_applied' in result:
|
| 1516 |
+
# Use consistent styling with other metadata fields
|
| 1517 |
+
metadata_html += f'<p><strong>Advanced Analysis:</strong> Custom instructions applied</p>'
|
| 1518 |
|
| 1519 |
# Close the metadata card
|
| 1520 |
metadata_html += '</div>'
|
|
|
|
| 1972 |
|
| 1973 |
if 'ocr_contents' not in result:
|
| 1974 |
st.error("No OCR content was extracted from the document.")
|
| 1975 |
+
else:
|
| 1976 |
+
# Check for minimal text content in OCR results
|
| 1977 |
+
has_minimal_text = False
|
| 1978 |
+
total_text_length = 0
|
| 1979 |
+
|
| 1980 |
+
# Check if the document is an image (not a PDF)
|
| 1981 |
+
is_image = result.get('file_name', '').lower().endswith(('.jpg', '.jpeg', '.png', '.gif'))
|
| 1982 |
+
|
| 1983 |
+
# If image file with raw_text only
|
| 1984 |
+
if is_image and 'ocr_contents' in result:
|
| 1985 |
+
ocr_contents = result['ocr_contents']
|
| 1986 |
+
|
| 1987 |
+
# Check if only raw_text exists with minimal content
|
| 1988 |
+
has_raw_text_only = False
|
| 1989 |
+
if 'raw_text' in ocr_contents:
|
| 1990 |
+
raw_text = ocr_contents['raw_text']
|
| 1991 |
+
total_text_length += len(raw_text.strip())
|
| 1992 |
+
|
| 1993 |
+
# Check if raw_text is the only significant field
|
| 1994 |
+
other_content_fields = [k for k in ocr_contents.keys()
|
| 1995 |
+
if k not in ['raw_text', 'error', 'partial_text']
|
| 1996 |
+
and isinstance(ocr_contents[k], (str, list))
|
| 1997 |
+
and ocr_contents[k]]
|
| 1998 |
+
|
| 1999 |
+
if len(other_content_fields) <= 1: # Only raw_text or one other field
|
| 2000 |
+
has_raw_text_only = True
|
| 2001 |
+
|
| 2002 |
+
# Check if minimal text was extracted (less than 50 characters)
|
| 2003 |
+
if total_text_length < 50 and has_raw_text_only:
|
| 2004 |
+
has_minimal_text = True
|
| 2005 |
+
|
| 2006 |
+
# Check if any meaningful preprocessing options were used
|
| 2007 |
+
preprocessing_used = False
|
| 2008 |
+
if preprocessing_options.get("document_type", "standard") != "standard":
|
| 2009 |
+
preprocessing_used = True
|
| 2010 |
+
if preprocessing_options.get("grayscale", False):
|
| 2011 |
+
preprocessing_used = True
|
| 2012 |
+
if preprocessing_options.get("denoise", False):
|
| 2013 |
+
preprocessing_used = True
|
| 2014 |
+
if preprocessing_options.get("contrast", 0) != 0:
|
| 2015 |
+
preprocessing_used = True
|
| 2016 |
+
if preprocessing_options.get("rotation", 0) != 0:
|
| 2017 |
+
preprocessing_used = True
|
| 2018 |
+
|
| 2019 |
+
# If minimal text was found and preprocessing options weren't used
|
| 2020 |
+
if has_minimal_text and not preprocessing_used and uploaded_file.type.startswith('image/'):
|
| 2021 |
+
st.warning("""
|
| 2022 |
+
**Limited text extracted from this image.**
|
| 2023 |
+
|
| 2024 |
+
Try using preprocessing options in the sidebar to improve results:
|
| 2025 |
+
- Convert to grayscale for clearer text
|
| 2026 |
+
- Use denoising for aged or degraded documents
|
| 2027 |
+
- Adjust contrast for faded text
|
| 2028 |
+
- Try different rotation if text orientation is unclear
|
| 2029 |
+
|
| 2030 |
+
Click the "Preprocessing Options" section in the sidebar under "Image Processing".
|
| 2031 |
+
""")
|
| 2032 |
|
| 2033 |
# Close document content div
|
| 2034 |
st.markdown('</div>', unsafe_allow_html=True)
|
|
|
|
| 2131 |
lang_tag = f"{lang} Language"
|
| 2132 |
subject_tags.append(lang_tag)
|
| 2133 |
|
| 2134 |
+
# Add preprocessing information as tags if preprocessing was applied
|
| 2135 |
+
if uploaded_file.type.startswith('image/'):
|
| 2136 |
+
# Check if meaningful preprocessing options were used
|
| 2137 |
+
if preprocessing_options.get("document_type", "standard") != "standard":
|
| 2138 |
+
doc_type = preprocessing_options["document_type"].capitalize()
|
| 2139 |
+
preprocessing_tag = f"Enhanced ({doc_type})"
|
| 2140 |
+
if preprocessing_tag not in subject_tags:
|
| 2141 |
+
subject_tags.append(preprocessing_tag)
|
| 2142 |
+
|
| 2143 |
+
preprocessing_methods = []
|
| 2144 |
+
if preprocessing_options.get("grayscale", False):
|
| 2145 |
+
preprocessing_methods.append("Grayscale")
|
| 2146 |
+
if preprocessing_options.get("denoise", False):
|
| 2147 |
+
preprocessing_methods.append("Denoised")
|
| 2148 |
+
if preprocessing_options.get("contrast", 0) != 0:
|
| 2149 |
+
contrast_val = preprocessing_options.get("contrast", 0)
|
| 2150 |
+
if contrast_val > 0:
|
| 2151 |
+
preprocessing_methods.append("Contrast Enhanced")
|
| 2152 |
+
else:
|
| 2153 |
+
preprocessing_methods.append("Contrast Reduced")
|
| 2154 |
+
if preprocessing_options.get("rotation", 0) != 0:
|
| 2155 |
+
preprocessing_methods.append("Rotated")
|
| 2156 |
+
|
| 2157 |
+
# Add a combined preprocessing tag if methods were applied
|
| 2158 |
+
if preprocessing_methods:
|
| 2159 |
+
prep_tag = "Preprocessed"
|
| 2160 |
+
if prep_tag not in subject_tags:
|
| 2161 |
+
subject_tags.append(prep_tag)
|
| 2162 |
+
|
| 2163 |
+
# Add the specific method as a tag if only one was used
|
| 2164 |
+
if len(preprocessing_methods) == 1:
|
| 2165 |
+
method_tag = preprocessing_methods[0]
|
| 2166 |
+
if method_tag not in subject_tags:
|
| 2167 |
+
subject_tags.append(method_tag)
|
| 2168 |
+
|
| 2169 |
except Exception as e:
|
| 2170 |
logger.warning(f"Error generating subject tags: {str(e)}")
|
| 2171 |
# Fallback tags if extraction fails
|
|
|
|
| 2222 |
except Exception as e:
|
| 2223 |
st.error(f"Error processing document: {str(e)}")
|
| 2224 |
else:
|
| 2225 |
+
# Example Documents section after file uploader
|
|
|
|
|
|
|
| 2226 |
st.subheader("Example Documents")
|
| 2227 |
|
| 2228 |
# Add a simplified info message about examples
|
|
|
|
| 2232 |
- Handwritten letters and documents
|
| 2233 |
- Printed books and articles
|
| 2234 |
- Multi-page PDFs
|
|
|
|
|
|
|
| 2235 |
""")
|
| 2236 |
|
| 2237 |
+
# Add CSS to make the dropdown match the column width
|
| 2238 |
+
st.markdown("""
|
| 2239 |
+
<style>
|
| 2240 |
+
/* Make the selectbox container match the full column width */
|
| 2241 |
+
.main .block-container .element-container:has([data-testid="stSelectbox"]) {
|
| 2242 |
+
width: 100% !important;
|
| 2243 |
+
max-width: 100% !important;
|
| 2244 |
+
}
|
| 2245 |
+
|
| 2246 |
+
/* Make the actual selectbox control take the full width */
|
| 2247 |
+
.stSelectbox > div > div {
|
| 2248 |
+
width: 100% !important;
|
| 2249 |
+
max-width: 100% !important;
|
| 2250 |
+
}
|
| 2251 |
+
</style>
|
| 2252 |
+
""", unsafe_allow_html=True)
|
| 2253 |
+
|
| 2254 |
+
# Sample document URLs dropdown with clearer label
|
| 2255 |
+
sample_urls = [
|
| 2256 |
+
"Select a sample document",
|
| 2257 |
+
"https://huggingface.co/spaces/milwright/historical-ocr/resolve/main/input/a-la-carte.pdf",
|
| 2258 |
+
"https://huggingface.co/spaces/milwright/historical-ocr/resolve/main/input/magician-or-bottle-cungerer.jpg",
|
| 2259 |
+
"https://huggingface.co/spaces/milwright/historical-ocr/resolve/main/input/handwritten-letter.jpg",
|
| 2260 |
+
"https://huggingface.co/spaces/milwright/historical-ocr/resolve/main/input/magellan-travels.jpg",
|
| 2261 |
+
"https://huggingface.co/spaces/milwright/historical-ocr/resolve/main/input/milgram-flier.png",
|
| 2262 |
+
"https://huggingface.co/spaces/milwright/historical-ocr/resolve/main/input/baldwin-15st-north.jpg"
|
| 2263 |
+
]
|
| 2264 |
+
|
| 2265 |
+
sample_names = [
|
| 2266 |
+
"Select a sample document",
|
| 2267 |
+
"Restaurant Menu (PDF)",
|
| 2268 |
+
"The Magician (Image)",
|
| 2269 |
+
"Handwritten Letter (Image)",
|
| 2270 |
+
"Magellan Travels (Image)",
|
| 2271 |
+
"Milgram Flier (Image)",
|
| 2272 |
+
"Baldwin Street (Image)"
|
| 2273 |
+
]
|
| 2274 |
+
|
| 2275 |
+
# Initialize sample_document in session state if it doesn't exist
|
| 2276 |
+
if 'sample_document' not in st.session_state:
|
| 2277 |
+
st.session_state.sample_document = None
|
| 2278 |
+
|
| 2279 |
+
selected_sample = st.selectbox("Select a sample document from `~/input`", options=range(len(sample_urls)), format_func=lambda i: sample_names[i])
|
| 2280 |
+
|
| 2281 |
+
if selected_sample > 0:
|
| 2282 |
+
selected_url = sample_urls[selected_sample]
|
| 2283 |
+
|
| 2284 |
+
# Add process button for the sample document
|
| 2285 |
+
if st.button("Load Sample Document"):
|
| 2286 |
+
try:
|
| 2287 |
+
import requests
|
| 2288 |
+
from io import BytesIO
|
| 2289 |
+
|
| 2290 |
+
with st.spinner(f"Downloading {sample_names[selected_sample]}..."):
|
| 2291 |
+
response = requests.get(selected_url)
|
| 2292 |
+
response.raise_for_status()
|
| 2293 |
+
|
| 2294 |
+
# Extract filename from URL
|
| 2295 |
+
file_name = selected_url.split("/")[-1]
|
| 2296 |
+
|
| 2297 |
+
# Create a BytesIO object from the downloaded content
|
| 2298 |
+
file_content = BytesIO(response.content)
|
| 2299 |
+
|
| 2300 |
+
# Store as a UploadedFile-like object in session state
|
| 2301 |
+
class SampleDocument:
|
| 2302 |
+
def __init__(self, name, content, content_type):
|
| 2303 |
+
self.name = name
|
| 2304 |
+
self._content = content
|
| 2305 |
+
self.type = content_type
|
| 2306 |
+
self.size = len(content)
|
| 2307 |
+
|
| 2308 |
+
def getvalue(self):
|
| 2309 |
+
return self._content
|
| 2310 |
+
|
| 2311 |
+
def read(self):
|
| 2312 |
+
return self._content
|
| 2313 |
+
|
| 2314 |
+
def seek(self, position):
|
| 2315 |
+
# Implement seek for compatibility with some file operations
|
| 2316 |
+
return
|
| 2317 |
+
|
| 2318 |
+
def tell(self):
|
| 2319 |
+
# Implement tell for compatibility
|
| 2320 |
+
return 0
|
| 2321 |
+
|
| 2322 |
+
# Determine content type based on file extension
|
| 2323 |
+
if file_name.lower().endswith('.pdf'):
|
| 2324 |
+
content_type = 'application/pdf'
|
| 2325 |
+
elif file_name.lower().endswith(('.jpg', '.jpeg')):
|
| 2326 |
+
content_type = 'image/jpeg'
|
| 2327 |
+
elif file_name.lower().endswith('.png'):
|
| 2328 |
+
content_type = 'image/png'
|
| 2329 |
+
else:
|
| 2330 |
+
content_type = 'application/octet-stream'
|
| 2331 |
+
|
| 2332 |
+
# Save download info in session state for more reliable handling
|
| 2333 |
+
st.session_state.sample_document = SampleDocument(
|
| 2334 |
+
name=file_name,
|
| 2335 |
+
content=response.content,
|
| 2336 |
+
content_type=content_type
|
| 2337 |
+
)
|
| 2338 |
+
|
| 2339 |
+
# Set a flag to indicate this is a newly loaded sample
|
| 2340 |
+
st.session_state.sample_just_loaded = True
|
| 2341 |
+
|
| 2342 |
+
# Force rerun to load the document
|
| 2343 |
+
st.rerun()
|
| 2344 |
+
except Exception as e:
|
| 2345 |
+
st.error(f"Error downloading sample document: {str(e)}")
|
| 2346 |
+
st.info("Please try uploading your own document instead.")
|