Spaces:
Runtime error
Runtime error
| """ | |
| Streamlit UI for Document Classification | |
| Upload PDFs and classify them using SmolVLM. | |
| Optimized with pre-loading and concurrent processing. | |
| """ | |
| import streamlit as st | |
| import pandas as pd | |
| import json | |
| from pathlib import Path | |
| from datetime import datetime | |
| import tempfile | |
| import os | |
| from concurrent.futures import ThreadPoolExecutor, as_completed | |
| import threading | |
| # Import our classifier modules | |
| from pdf_to_image import pdf_to_images | |
| from smolvlm_classifier import SmolVLMClassifier | |
# Page config: browser-tab title/icon and a full-width layout.
st.set_page_config(
    layout="wide",
    page_icon="π",
    page_title="Document Classifier",
)

# Custom CSS for the header, the per-document result cards and status areas.
# Kept in a named constant so the stylesheet is separate from the call that
# injects it into the page.
_CUSTOM_CSS = """
<style>
.main-header {
font-size: 2.5rem;
font-weight: bold;
color: #1f77b4;
margin-bottom: 1rem;
}
.result-box {
background-color: #f0f8ff;
padding: 0.8rem 1rem;
border-radius: 8px;
border-left: 4px solid #1f77b4;
margin: 0.5rem 0;
display: inline-block;
}
.doc-type {
font-size: 1.2rem;
font-weight: bold;
color: #2e7d32;
margin: 0;
}
.file-info {
font-size: 0.9rem;
color: #555;
margin: 0.2rem 0;
}
.model-status {
padding: 0.5rem;
border-radius: 5px;
margin-bottom: 1rem;
}
</style>
"""
st.markdown(_CUSTOM_CSS, unsafe_allow_html=True)
@st.cache_resource(show_spinner=False)
def load_classifier():
    """Return the SmolVLM classifier, constructing it only once.

    Streamlit re-executes the script on every widget interaction, so an
    undecorated function would build a new ``SmolVLMClassifier`` on each
    rerun. ``st.cache_resource`` keeps the first instance and hands the same
    object back on later calls.

    The built-in cache spinner is disabled because the caller already wraps
    this call in its own ``st.spinner``.

    Returns:
        The cached ``SmolVLMClassifier`` instance.
    """
    return SmolVLMClassifier()
def load_history():
    """Load classification history from ``classification_history.json``.

    The file is looked up relative to the current working directory.

    Returns:
        The list stored in the file, or an empty list when the file is
        missing, is empty, is not valid UTF-8 JSON, or holds something other
        than a list. Callers insert into the result, so anything that is not
        a list is treated as "no history" instead of being passed through.
    """
    history_file = Path("classification_history.json")
    try:
        # Open directly instead of checking exists() first: the file can
        # disappear between the check and the open.
        with history_file.open("r", encoding="utf-8") as f:
            history = json.load(f)
    except FileNotFoundError:
        return []
    except ValueError as exc:
        # json.JSONDecodeError and UnicodeDecodeError are both ValueError
        # subclasses; a damaged history file must not take the app down.
        print(f"Ignoring unreadable {history_file}: {exc}")
        return []
    return history if isinstance(history, list) else []
def save_history(history):
    """Save classification history to ``classification_history.json``.

    The history is serialized to a string *before* the file is opened.
    Opening with mode "w" truncates the file immediately, so serializing
    while writing would destroy the existing history whenever an entry turns
    out not to be JSON-serializable.

    Args:
        history: JSON-serializable history (a list of entry dicts in this app).

    Raises:
        TypeError: If ``history`` contains a value ``json`` cannot encode.
            The existing file is left untouched in that case.
    """
    payload = json.dumps(history, indent=2, ensure_ascii=False)
    Path("classification_history.json").write_text(payload, encoding="utf-8")
def add_to_history(filename, doc_type, num_pages):
    """Record one classification at the top of the persisted history.

    Args:
        filename: Name of the classified file.
        doc_type: Document type reported for the file.
        num_pages: Page count reported for the file.

    Returns:
        The history as saved: newest entry first, at most 100 entries.
    """
    max_entries = 100

    entries = load_history()
    newest = {
        "filename": filename,
        "document_type": doc_type,
        "num_pages": num_pages,
        "timestamp": f"{datetime.now():%Y-%m-%d %H:%M:%S}",
    }
    entries.insert(0, newest)

    # Drop everything beyond the most recent `max_entries` records.
    entries = entries[:max_entries]
    save_history(entries)
    return entries
def convert_pdf_to_images(uploaded_file):
    """Render one uploaded PDF to page images via a temporary file.

    The upload's bytes are written to a temporary ``.pdf`` file, which is
    passed to ``pdf_to_images`` by path (at 100 dpi) and deleted afterwards,
    whether or not the conversion succeeds. Written to be submitted to a
    thread pool.

    Args:
        uploaded_file: Object exposing ``getvalue()`` (the PDF bytes) and
            ``name``.

    Returns:
        A ``(uploaded_file.name, images)`` tuple, where ``images`` is whatever
        ``pdf_to_images`` returned.
    """
    with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as scratch:
        scratch.write(uploaded_file.getvalue())
        scratch_path = scratch.name

    try:
        pages = pdf_to_images(scratch_path, dpi=100)
        return uploaded_file.name, pages
    finally:
        # delete=False above means the file must be removed by hand.
        Path(scratch_path).unlink()
def _preview_key(uploaded_file):
    """Return the session-cache key for an upload's rendered page images.

    The size is part of the key so that a different file uploaded under a
    previously used name is not served the earlier file's cached pages.
    """
    return (uploaded_file.name, uploaded_file.size)


def _convert_or_empty(uploaded_file):
    """Convert one upload to page images, returning [] if conversion raises.

    The UI already has "could not load / could not extract pages" branches
    for empty results; mapping exceptions to an empty list routes a broken
    PDF into those branches instead of aborting the whole page or batch.
    """
    try:
        _, images = convert_pdf_to_images(uploaded_file)
    except Exception as exc:  # UI boundary: report and carry on with the rest
        print(f"PDF conversion failed for {uploaded_file.name}: {exc}")
        return []
    return images


def _render_history_sidebar():
    """Render the sidebar: history table plus a button that clears it."""
    with st.sidebar:
        st.header("π Classification History")
        history = load_history()
        if not history:
            st.info("No classification history yet. Upload a document to get started!")
            return

        df_history = pd.DataFrame(history)
        st.dataframe(
            df_history[["filename", "document_type", "timestamp"]],
            hide_index=True,
            width="stretch",
        )
        if st.button("ποΈ Clear History"):
            save_history([])
            st.rerun()


def _render_previews(uploaded_files):
    """Render one expander per upload with a page preview.

    Converted pages are cached in ``st.session_state["pdf_previews"]`` so a
    rerun does not convert the same upload again.
    """
    if "pdf_previews" not in st.session_state:
        st.session_state["pdf_previews"] = {}
    previews = st.session_state["pdf_previews"]

    for f in uploaded_files:
        with st.expander(f"π {f.name} ({f.size / 1024:.1f} KB)", expanded=False):
            key = _preview_key(f)
            if key not in previews:
                previews[key] = _convert_or_empty(f)

            images = previews[key]
            if not images:
                st.error("Could not load PDF preview")
                continue

            if len(images) > 1:
                page_num = st.selectbox(
                    "Page",
                    range(1, len(images) + 1),
                    key=f"page_{f.name}",
                )
                st.image(
                    images[page_num - 1],
                    caption=f"Page {page_num} of {len(images)}",
                    width="stretch",
                )
            else:
                st.image(images[0], caption="Page 1", width="stretch")


def _classify_documents(classifier, uploaded_files):
    """Convert and classify every upload, reporting progress in the UI.

    Uploads without cached page images are converted in a thread pool; the
    documents are then classified one after another. Successful
    classifications are appended to the persisted history and timing details
    are printed to the terminal.

    Args:
        classifier: Object with a ``classify_document(images)`` method whose
            result has ``"document_type"`` and ``"num_pages"`` keys.
        uploaded_files: Non-empty list of uploaded PDF files.

    Returns:
        ``(results, timing)``: one result dict per upload, and a dict of
        rounded timings in seconds (``total_min`` is in minutes).
    """
    import time

    previews = st.session_state.get("pdf_previews", {})
    all_results = []
    progress_bar = st.progress(0)
    status_text = st.empty()
    total_start_time = time.time()

    # STEP 1: convert uploads that have no cached preview, in parallel.
    status_text.text("π Converting PDFs to images (parallel)...")
    pdf_conversion_start = time.time()
    pdf_images = {}
    with ThreadPoolExecutor(max_workers=4) as executor:
        future_to_key = {}
        for f in uploaded_files:
            key = _preview_key(f)
            if key in previews:
                pdf_images[key] = previews[key]
            else:
                future_to_key[executor.submit(_convert_or_empty, f)] = key
        for future in as_completed(future_to_key):
            pdf_images[future_to_key[future]] = future.result()
    pdf_conversion_time = time.time() - pdf_conversion_start
    print(f"\nπ PDF Conversion: {pdf_conversion_time:.2f}s (parallel)")
    progress_bar.progress(0.2)
    status_text.text("π€ Classifying documents...")

    # STEP 2: classify each document sequentially, timing each one.
    classification_start = time.time()
    for idx, uploaded_file in enumerate(uploaded_files):
        images = pdf_images.get(_preview_key(uploaded_file), [])
        if not images:
            result = {
                "filename": uploaded_file.name,
                "document_type": "Error: Could not extract pages",
                "num_pages": 0,
                "classify_time": 0,
            }
        else:
            status_text.text(
                f"π€ Classifying {idx + 1}/{len(uploaded_files)}: {uploaded_file.name}"
            )
            classify_start = time.time()
            classification = classifier.classify_document(images)
            classify_time = time.time() - classify_start
            result = {
                "filename": uploaded_file.name,
                "document_type": classification["document_type"],
                "num_pages": classification["num_pages"],
                "classify_time": round(classify_time, 2),
            }
            # Terminal output
            print(f" π {uploaded_file.name}")
            print(f" Pages: {classification['num_pages']}")
            print(f" Type: {classification['document_type']}")
            print(f" Classification time: {classify_time:.2f}s")
            add_to_history(
                uploaded_file.name,
                classification["document_type"],
                classification["num_pages"],
            )
        all_results.append(result)
        # The first 20% of the bar is the conversion step; the rest is split
        # evenly across documents.
        progress_bar.progress(0.2 + 0.8 * (idx + 1) / len(uploaded_files))

    total_classification_time = time.time() - classification_start
    total_time = time.time() - total_start_time
    avg_per_doc = total_classification_time / len(all_results)

    # Print summary to terminal
    print(f"\n{'='*50}")
    print("TIMING SUMMARY")
    print(f"{'='*50}")
    print(f"Documents processed: {len(all_results)}")
    print(f"PDF conversion (parallel): {pdf_conversion_time:.2f}s")
    print(f"Classification (sequential): {total_classification_time:.2f}s")
    print(f"Average per document: {avg_per_doc:.2f}s")
    print(f"Total time: {total_time:.2f}s ({total_time/60:.1f} min)")
    print(f"{'='*50}\n")

    timing = {
        "pdf_conversion": round(pdf_conversion_time, 2),
        "classification": round(total_classification_time, 2),
        "total": round(total_time, 2),
        "total_min": round(total_time / 60, 2),
        "avg_per_doc": round(avg_per_doc, 2),
    }
    status_text.text(f"β Complete! Total: {total_time:.1f}s ({total_time/60:.1f} min)")
    return all_results, timing


def _render_results():
    """Render the stored results table, timing metrics and result cards."""
    import html

    results = st.session_state.get("results")
    if not results:
        st.info("π Upload and classify documents to see results here.")
        return

    # Compact table with per-document timing.
    st.dataframe(
        pd.DataFrame(results),
        hide_index=True,
        width="stretch",
        column_config={
            "filename": st.column_config.TextColumn("File", width="medium"),
            "document_type": st.column_config.TextColumn("Type", width="medium"),
            "num_pages": st.column_config.NumberColumn("Pages", width="small"),
            "classify_time": st.column_config.NumberColumn("Time (s)", width="small"),
        },
    )

    if "timing" in st.session_state:
        timing = st.session_state["timing"]
        st.markdown("---")
        st.markdown("**β±οΈ Timing Summary**")
        col_t1, col_t2, col_t3 = st.columns(3)
        with col_t1:
            st.metric("PDF Conversion", f"{timing['pdf_conversion']}s")
        with col_t2:
            st.metric("Classification", f"{timing['classification']}s")
        with col_t3:
            st.metric("Avg per Doc", f"{timing['avg_per_doc']}s")
        st.info(f"**Total Time:** {timing['total']}s ({timing['total_min']} min)")

    st.success(f"β Classified {len(results)} document(s)")

    for result in results:
        # The filename comes from the uploader and the type from the model;
        # both are rendered as raw HTML below, so escape them first.
        name = html.escape(str(result["filename"]))
        pages = html.escape(str(result["num_pages"]))
        doc_type = html.escape(str(result["document_type"]))
        card_html = (
            '\n<div class="result-box">\n'
            f'<p class="file-info"><strong>{name}</strong> ({pages} pages)</p>\n'
            f'<p class="doc-type">π {doc_type}</p>\n'
            "</div>\n"
        )
        st.markdown(card_html, unsafe_allow_html=True)


def main():
    """Render the document-classifier page.

    Layout: header, model load, history sidebar, then two columns: upload,
    preview and the classify button on the left, results on the right.
    Results and timings are kept in ``st.session_state`` so they remain
    visible on reruns that do not re-classify.
    """
    # Header
    st.markdown('<div class="main-header">π Document Classifier</div>', unsafe_allow_html=True)
    st.markdown("Upload PDF documents to classify them using SmolVLM AI.")

    # Load the model up front rather than on the first button click.
    with st.spinner("π Loading AI model (one-time setup)..."):
        classifier = load_classifier()
    st.success("β Model ready!")

    _render_history_sidebar()

    # Main content - two columns
    col1, col2 = st.columns([1, 1])
    with col1:
        st.subheader("π€ Upload Documents")
        uploaded_files = st.file_uploader(
            "Choose PDF files",
            type=["pdf"],
            accept_multiple_files=True,
            help="Upload one or more PDF documents to classify",
        )
        if uploaded_files:
            st.success(f"β Uploaded {len(uploaded_files)} file(s)")
            _render_previews(uploaded_files)
            if st.button("π Classify All Documents", type="primary", width="stretch"):
                results, timing = _classify_documents(classifier, uploaded_files)
                st.session_state["timing"] = timing
                st.session_state["results"] = results

    with col2:
        st.subheader("π Classification Results")
        _render_results()


if __name__ == "__main__":
    main()