""" Streamlit UI for Document Classification Upload PDFs and classify them using SmolVLM. Optimized with pre-loading and concurrent processing. """ import streamlit as st import pandas as pd import json from pathlib import Path from datetime import datetime import tempfile import os from concurrent.futures import ThreadPoolExecutor, as_completed import threading # Import our classifier modules from pdf_to_image import pdf_to_images from smolvlm_classifier import SmolVLMClassifier # Page config st.set_page_config( page_title="Document Classifier", page_icon="📄", layout="wide" ) # Custom CSS for better styling st.markdown(""" """, unsafe_allow_html=True) @st.cache_resource def load_classifier(): """Load the classifier once and cache it.""" return SmolVLMClassifier() def load_history(): """Load classification history from JSON file.""" history_file = Path("classification_history.json") if history_file.exists(): with open(history_file, "r", encoding="utf-8") as f: return json.load(f) return [] def save_history(history): """Save classification history to JSON file.""" with open("classification_history.json", "w", encoding="utf-8") as f: json.dump(history, f, indent=2, ensure_ascii=False) def add_to_history(filename, doc_type, num_pages): """Add a classification result to history.""" history = load_history() history.insert(0, { "filename": filename, "document_type": doc_type, "num_pages": num_pages, "timestamp": datetime.now().strftime("%Y-%m-%d %H:%M:%S") }) # Keep only last 100 entries history = history[:100] save_history(history) return history def convert_pdf_to_images(uploaded_file): """Convert a single PDF to images. Used for threading.""" with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp_file: tmp_file.write(uploaded_file.getvalue()) tmp_path = tmp_file.name try: images = pdf_to_images(tmp_path, dpi=100) return uploaded_file.name, images finally: os.unlink(tmp_path) def main(): # Header st.markdown('

📄 Document Classifier

', unsafe_allow_html=True) st.markdown("Upload PDF documents to classify them using SmolVLM AI.") # PRE-LOAD MODEL AT APP START (not on button click) # This runs once when the app starts with st.spinner("🔄 Loading AI model (one-time setup)..."): classifier = load_classifier() st.success("✅ Model ready!") # Sidebar for history with st.sidebar: st.header("📋 Classification History") history = load_history() if history: # Show as table df_history = pd.DataFrame(history) st.dataframe( df_history[["filename", "document_type", "timestamp"]], hide_index=True, width="stretch" ) # Clear history button if st.button("🗑️ Clear History"): save_history([]) st.rerun() else: st.info("No classification history yet. Upload a document to get started!") # Main content - two columns col1, col2 = st.columns([1, 1]) with col1: st.subheader("📤 Upload Documents") # File uploader - MULTIPLE FILES uploaded_files = st.file_uploader( "Choose PDF files", type=["pdf"], accept_multiple_files=True, help="Upload one or more PDF documents to classify" ) if uploaded_files: st.success(f"✅ Uploaded {len(uploaded_files)} file(s)") # Store images for preview if "pdf_previews" not in st.session_state: st.session_state["pdf_previews"] = {} # Show file list with preview option for f in uploaded_files: with st.expander(f"📄 {f.name} ({f.size / 1024:.1f} KB)", expanded=False): # Generate preview if not cached if f.name not in st.session_state["pdf_previews"]: with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp_file: tmp_file.write(f.getvalue()) tmp_path = tmp_file.name try: images = pdf_to_images(tmp_path, dpi=100) st.session_state["pdf_previews"][f.name] = images finally: os.unlink(tmp_path) # Show preview images = st.session_state["pdf_previews"].get(f.name, []) if images: if len(images) > 1: page_num = st.selectbox( f"Page", range(1, len(images) + 1), key=f"page_{f.name}" ) st.image(images[page_num - 1], caption=f"Page {page_num} of {len(images)}", width="stretch") else: st.image(images[0], caption="Page 1", width="stretch") else: st.error("Could not load PDF preview") # Classify button if st.button("🔍 Classify All Documents", type="primary", width="stretch"): import time all_results = [] progress_bar = st.progress(0) status_text = st.empty() total_start_time = time.time() # STEP 1: Pre-convert all PDFs to images using threading status_text.text("📄 Converting PDFs to images (parallel)...") pdf_conversion_start = time.time() pdf_images = {} # Use ThreadPoolExecutor for parallel PDF conversion with ThreadPoolExecutor(max_workers=4) as executor: # Submit all PDF conversion tasks future_to_file = { executor.submit(convert_pdf_to_images, f): f for f in uploaded_files if f.name not in st.session_state.get("pdf_previews", {}) } # Also add cached previews for f in uploaded_files: if f.name in st.session_state.get("pdf_previews", {}): pdf_images[f.name] = st.session_state["pdf_previews"][f.name] # Collect results for future in as_completed(future_to_file): filename, images = future.result() pdf_images[filename] = images pdf_conversion_time = time.time() - pdf_conversion_start print(f"\n📄 PDF Conversion: {pdf_conversion_time:.2f}s (parallel)") progress_bar.progress(0.2) status_text.text("🤖 Classifying documents...") # STEP 2: Classify each document with timing classification_start = time.time() for idx, uploaded_file in enumerate(uploaded_files): doc_start_time = time.time() images = pdf_images.get(uploaded_file.name, []) if not images: result = { "filename": uploaded_file.name, "document_type": "Error: Could not extract pages", "num_pages": 0, "classify_time": 0 } else: status_text.text(f"🤖 Classifying {idx + 1}/{len(uploaded_files)}: {uploaded_file.name}") # Classify with timing classify_start = time.time() classification = classifier.classify_document(images) classify_time = time.time() - classify_start result = { "filename": uploaded_file.name, "document_type": classification["document_type"], "num_pages": classification["num_pages"], "classify_time": round(classify_time, 2) } # Terminal output print(f" 📄 {uploaded_file.name}") print(f" Pages: {classification['num_pages']}") print(f" Type: {classification['document_type']}") print(f" Classification time: {classify_time:.2f}s") # Add to history add_to_history( uploaded_file.name, classification["document_type"], classification["num_pages"] ) all_results.append(result) # Update progress progress_bar.progress(0.2 + 0.8 * (idx + 1) / len(uploaded_files)) total_classification_time = time.time() - classification_start total_time = time.time() - total_start_time # Print summary to terminal print(f"\n{'='*50}") print("TIMING SUMMARY") print(f"{'='*50}") print(f"Documents processed: {len(all_results)}") print(f"PDF conversion (parallel): {pdf_conversion_time:.2f}s") print(f"Classification (sequential): {total_classification_time:.2f}s") print(f"Average per document: {total_classification_time/len(all_results):.2f}s") print(f"Total time: {total_time:.2f}s ({total_time/60:.1f} min)") print(f"{'='*50}\n") # Store timing info st.session_state["timing"] = { "pdf_conversion": round(pdf_conversion_time, 2), "classification": round(total_classification_time, 2), "total": round(total_time, 2), "total_min": round(total_time / 60, 2), "avg_per_doc": round(total_classification_time / len(all_results), 2) } status_text.text(f"✅ Complete! Total: {total_time:.1f}s ({total_time/60:.1f} min)") st.session_state["results"] = all_results with col2: st.subheader("📊 Classification Results") # Show results if "results" in st.session_state and st.session_state["results"]: results = st.session_state["results"] # Show as compact table with timing df_results = pd.DataFrame(results) st.dataframe( df_results, hide_index=True, width="stretch", column_config={ "filename": st.column_config.TextColumn("File", width="medium"), "document_type": st.column_config.TextColumn("Type", width="medium"), "num_pages": st.column_config.NumberColumn("Pages", width="small"), "classify_time": st.column_config.NumberColumn("Time (s)", width="small") } ) # Show timing summary if available if "timing" in st.session_state: timing = st.session_state["timing"] st.markdown("---") st.markdown("**⏱️ Timing Summary**") col_t1, col_t2, col_t3 = st.columns(3) with col_t1: st.metric("PDF Conversion", f"{timing['pdf_conversion']}s") with col_t2: st.metric("Classification", f"{timing['classification']}s") with col_t3: st.metric("Avg per Doc", f"{timing['avg_per_doc']}s") st.info(f"**Total Time:** {timing['total']}s ({timing['total_min']} min)") # Summary st.success(f"✅ Classified {len(results)} document(s)") # Show individual result boxes (compact) for result in results: st.markdown(f"""

{result['filename']} ({result['num_pages']} pages)

📑 {result['document_type']}

""", unsafe_allow_html=True) else: st.info("👆 Upload and classify documents to see results here.") if __name__ == "__main__": main()