Spaces:

Qasim-Dost
/

Doc-Classifier

Runtime error

App Files Files Community

Doc-Classifier / app.py

Qasim-Dost

Upload 5 files

ce9f3ac verified 28 days ago

raw

history blame contribute delete

14.5 kB

	"""
	Streamlit UI for Document Classification
	Upload PDFs and classify them using SmolVLM.
	Optimized with pre-loading and concurrent processing.
	"""

	import streamlit as st
	import pandas as pd
	import json
	from pathlib import Path
	from datetime import datetime
	import tempfile
	import os
	from concurrent.futures import ThreadPoolExecutor, as_completed
	import threading

	# Import our classifier modules
	from pdf_to_image import pdf_to_images
	from smolvlm_classifier import SmolVLMClassifier


	# Page config: browser-tab title and icon, and the full-width ("wide") layout.
	st.set_page_config(
	page_title="Document Classifier",
	page_icon="📄",
	layout="wide"
	)

	# Inject the CSS classes referenced by the raw-HTML snippets rendered in main():
	# .main-header styles the page title; .result-box, .file-info and .doc-type
	# style the per-document result cards. .model-status is defined here but is
	# not referenced by any markup visible in this file.
	st.markdown("""
	<style>
	.main-header {
	font-size: 2.5rem;
	font-weight: bold;
	color: #1f77b4;
	margin-bottom: 1rem;
	}
	.result-box {
	background-color: #f0f8ff;
	padding: 0.8rem 1rem;
	border-radius: 8px;
	border-left: 4px solid #1f77b4;
	margin: 0.5rem 0;
	display: inline-block;
	}
	.doc-type {
	font-size: 1.2rem;
	font-weight: bold;
	color: #2e7d32;
	margin: 0;
	}
	.file-info {
	font-size: 0.9rem;
	color: #555;
	margin: 0.2rem 0;
	}
	.model-status {
	padding: 0.5rem;
	border-radius: 5px;
	margin-bottom: 1rem;
	}
	</style>
	""", unsafe_allow_html=True)


	@st.cache_resource
	def load_classifier():
	    """Construct the SmolVLM classifier.

	    Wrapped in ``st.cache_resource`` so the instance is built once and then
	    reused instead of being re-created on every script rerun.
	    """
	    classifier = SmolVLMClassifier()
	    return classifier


	def load_history():
	    """Load classification history from ``classification_history.json``.

	    Returns:
	        The list of history entries stored in the file, newest first as
	        written by ``add_to_history``. An empty list is returned when the
	        file does not exist, cannot be read or decoded, is empty or holds
	        invalid JSON, or holds JSON that is not a list.
	    """
	    history_file = Path("classification_history.json")
	    try:
	        with open(history_file, "r", encoding="utf-8") as f:
	            history = json.load(f)
	    except FileNotFoundError:
	        # First run, or history was never written: nothing to show yet.
	        return []
	    except (OSError, ValueError) as exc:
	        # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
	        # A damaged history file must not take the whole app down; report
	        # it on the terminal and start from an empty history instead.
	        print(f"Ignoring unreadable history file {history_file}: {exc}")
	        return []

	    # Callers insert into and slice the result, so anything but a list
	    # (e.g. a hand-edited file containing an object) is treated as empty.
	    return history if isinstance(history, list) else []


	def save_history(history):
	    """Save classification history to ``classification_history.json``.

	    Args:
	        history: JSON-serialisable list of history entries.

	    Raises:
	        TypeError: If ``history`` contains a value ``json`` cannot serialise.
	            The existing file is left untouched in that case.
	    """
	    # Serialise BEFORE opening the file: opening with "w" truncates it, so a
	    # serialisation error raised mid-dump would otherwise destroy the history
	    # that was already on disk.
	    payload = json.dumps(history, indent=2, ensure_ascii=False)
	    with open("classification_history.json", "w", encoding="utf-8") as f:
	        f.write(payload)


	def add_to_history(filename, doc_type, num_pages):
	    """Record one classification result at the front of the stored history.

	    Args:
	        filename: Name of the classified file.
	        doc_type: Document type assigned to the file.
	        num_pages: Number of pages reported for the file.

	    Returns:
	        The updated history (newest entry first, at most 100 entries) after
	        it has been written back with ``save_history``.
	    """
	    previous = load_history()
	    entry = {
	        "filename": filename,
	        "document_type": doc_type,
	        "num_pages": num_pages,
	        "timestamp": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
	    }
	    # Newest first; anything beyond the 100 most recent entries is dropped.
	    trimmed = [entry, *previous][:100]
	    save_history(trimmed)
	    return trimmed


	def convert_pdf_to_images(uploaded_file):
	    """Convert a single uploaded PDF to page images. Used for threading.

	    The upload's bytes are written to a temporary ``.pdf`` file because
	    ``pdf_to_images`` is called with a file path; the temporary file is
	    removed again whether or not the conversion succeeds.

	    Args:
	        uploaded_file: Object exposing ``getvalue()`` (the PDF bytes) and
	            ``name`` (the original file name).

	    Returns:
	        A ``(name, images)`` tuple, where ``images`` is whatever
	        ``pdf_to_images(path, dpi=100)`` returns for the file.
	    """
	    tmp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".pdf")
	    try:
	        # Close the handle before converting so the converter can open the
	        # path itself. The write sits inside the try so a failed write
	        # (e.g. disk full) no longer leaves the temp file behind.
	        with tmp_file:
	            tmp_file.write(uploaded_file.getvalue())
	        images = pdf_to_images(tmp_file.name, dpi=100)
	        return uploaded_file.name, images
	    finally:
	        os.unlink(tmp_file.name)


	def _preview_key(uploaded_file):
	    """Return the cache key under which an upload's page images are stored."""
	    # The file name alone is not a safe key: replacing an upload with a
	    # different file of the same name would reuse the old pages. Adding the
	    # size distinguishes those cases (same name AND same size still collide).
	    return f"{uploaded_file.name}|{uploaded_file.size}"


	def _render_sidebar():
	    """Render the classification history table and the clear-history button."""
	    with st.sidebar:
	        st.header("📋 Classification History")
	        history = load_history()

	        if not history:
	            st.info("No classification history yet. Upload a document to get started!")
	            return

	        # Show as table
	        df_history = pd.DataFrame(history)
	        st.dataframe(
	            df_history[["filename", "document_type", "timestamp"]],
	            hide_index=True,
	            width="stretch",
	        )

	        # Clear history button
	        if st.button("🗑️ Clear History"):
	            save_history([])
	            st.rerun()


	def _render_previews(uploaded_files):
	    """Show one collapsed expander per upload containing a page preview.

	    Rendered pages are cached in ``st.session_state["pdf_previews"]`` so a
	    file is only converted once per session.
	    """
	    if "pdf_previews" not in st.session_state:
	        st.session_state["pdf_previews"] = {}
	    previews = st.session_state["pdf_previews"]

	    for idx, f in enumerate(uploaded_files):
	        key = _preview_key(f)
	        with st.expander(f"📄 {f.name} ({f.size / 1024:.1f} KB)", expanded=False):
	            # Generate preview if not cached
	            if key not in previews:
	                _, previews[key] = convert_pdf_to_images(f)

	            images = previews.get(key, [])
	            if not images:
	                st.error("Could not load PDF preview")
	                continue

	            if len(images) > 1:
	                page_num = st.selectbox(
	                    "Page",
	                    range(1, len(images) + 1),
	                    # The index keeps widget keys unique even when two
	                    # uploads share a file name.
	                    key=f"page_{idx}_{f.name}",
	                )
	                st.image(
	                    images[page_num - 1],
	                    caption=f"Page {page_num} of {len(images)}",
	                    width="stretch",
	                )
	            else:
	                st.image(images[0], caption="Page 1", width="stretch")


	def _collect_page_images(uploaded_files):
	    """Return ``{preview key: page images}`` for every upload.

	    Pages already cached by the preview step are reused; any remaining files
	    are converted on a thread pool.
	    """
	    cached = st.session_state.get("pdf_previews", {})
	    pdf_images = {}
	    pending = []
	    for f in uploaded_files:
	        key = _preview_key(f)
	        if key in cached:
	            pdf_images[key] = cached[key]
	        else:
	            pending.append(f)

	    # Use ThreadPoolExecutor for parallel PDF conversion
	    with ThreadPoolExecutor(max_workers=4) as executor:
	        future_to_file = {
	            executor.submit(convert_pdf_to_images, f): f for f in pending
	        }
	        for future in as_completed(future_to_file):
	            _, images = future.result()
	            pdf_images[_preview_key(future_to_file[future])] = images

	    return pdf_images


	def _classify_all(classifier, uploaded_files):
	    """Classify every upload, reporting progress and timing.

	    Results are stored in ``st.session_state["results"]`` and the timing
	    summary in ``st.session_state["timing"]``; each successfully classified
	    document is also appended to the persistent history. ``uploaded_files``
	    must be non-empty (the caller only shows the button when it is).
	    """
	    import time

	    all_results = []
	    progress_bar = st.progress(0)
	    status_text = st.empty()

	    total_start_time = time.perf_counter()

	    # STEP 1: make sure every PDF has been converted to page images
	    status_text.text("📄 Converting PDFs to images (parallel)...")
	    pdf_conversion_start = time.perf_counter()
	    pdf_images = _collect_page_images(uploaded_files)
	    pdf_conversion_time = time.perf_counter() - pdf_conversion_start
	    print(f"\n📄 PDF Conversion: {pdf_conversion_time:.2f}s (parallel)")

	    progress_bar.progress(0.2)
	    status_text.text("🤖 Classifying documents...")

	    # STEP 2: Classify each document with timing
	    classification_start = time.perf_counter()
	    total_files = len(uploaded_files)

	    for idx, uploaded_file in enumerate(uploaded_files):
	        images = pdf_images.get(_preview_key(uploaded_file), [])

	        if not images:
	            result = {
	                "filename": uploaded_file.name,
	                "document_type": "Error: Could not extract pages",
	                "num_pages": 0,
	                "classify_time": 0,
	            }
	        else:
	            status_text.text(f"🤖 Classifying {idx + 1}/{total_files}: {uploaded_file.name}")

	            # Classify with timing
	            classify_start = time.perf_counter()
	            classification = classifier.classify_document(images)
	            classify_time = time.perf_counter() - classify_start

	            result = {
	                "filename": uploaded_file.name,
	                "document_type": classification["document_type"],
	                "num_pages": classification["num_pages"],
	                "classify_time": round(classify_time, 2),
	            }

	            # Terminal output
	            print(f" 📄 {uploaded_file.name}")
	            print(f" Pages: {classification['num_pages']}")
	            print(f" Type: {classification['document_type']}")
	            print(f" Classification time: {classify_time:.2f}s")

	            # Add to history
	            add_to_history(
	                uploaded_file.name,
	                classification["document_type"],
	                classification["num_pages"],
	            )

	        all_results.append(result)

	        # Update progress. Floating-point rounding can land a hair above 1.0
	        # on the last file (e.g. with 3 files), so clamp the value.
	        progress_bar.progress(min(1.0, 0.2 + 0.8 * (idx + 1) / total_files))

	    total_classification_time = time.perf_counter() - classification_start
	    total_time = time.perf_counter() - total_start_time
	    avg_per_doc = total_classification_time / len(all_results)

	    # Print summary to terminal
	    print(f"\n{'='*50}")
	    print("TIMING SUMMARY")
	    print(f"{'='*50}")
	    print(f"Documents processed: {len(all_results)}")
	    print(f"PDF conversion (parallel): {pdf_conversion_time:.2f}s")
	    print(f"Classification (sequential): {total_classification_time:.2f}s")
	    print(f"Average per document: {avg_per_doc:.2f}s")
	    print(f"Total time: {total_time:.2f}s ({total_time/60:.1f} min)")
	    print(f"{'='*50}\n")

	    # Store timing info
	    st.session_state["timing"] = {
	        "pdf_conversion": round(pdf_conversion_time, 2),
	        "classification": round(total_classification_time, 2),
	        "total": round(total_time, 2),
	        "total_min": round(total_time / 60, 2),
	        "avg_per_doc": round(avg_per_doc, 2),
	    }

	    status_text.text(f"✅ Complete! Total: {total_time:.1f}s ({total_time/60:.1f} min)")
	    st.session_state["results"] = all_results


	def _render_results():
	    """Render the results table, timing metrics and per-document cards."""
	    import html

	    results = st.session_state.get("results")
	    if not results:
	        st.info("👆 Upload and classify documents to see results here.")
	        return

	    # Show as compact table with timing
	    df_results = pd.DataFrame(results)
	    st.dataframe(
	        df_results,
	        hide_index=True,
	        width="stretch",
	        column_config={
	            "filename": st.column_config.TextColumn("File", width="medium"),
	            "document_type": st.column_config.TextColumn("Type", width="medium"),
	            "num_pages": st.column_config.NumberColumn("Pages", width="small"),
	            "classify_time": st.column_config.NumberColumn("Time (s)", width="small"),
	        },
	    )

	    # Show timing summary if available
	    if "timing" in st.session_state:
	        timing = st.session_state["timing"]
	        st.markdown("---")
	        st.markdown("⏱️ Timing Summary")
	        col_t1, col_t2, col_t3 = st.columns(3)
	        with col_t1:
	            st.metric("PDF Conversion", f"{timing['pdf_conversion']}s")
	        with col_t2:
	            st.metric("Classification", f"{timing['classification']}s")
	        with col_t3:
	            st.metric("Avg per Doc", f"{timing['avg_per_doc']}s")

	        st.info(f"Total Time: {timing['total']}s ({timing['total_min']} min)")

	    # Summary
	    st.success(f"✅ Classified {len(results)} document(s)")

	    # Show individual result boxes (compact)
	    for result in results:
	        # The file name comes from the user and the type from the model;
	        # both are escaped because the card is rendered as raw HTML.
	        filename = html.escape(str(result["filename"]))
	        doc_type = html.escape(str(result["document_type"]))
	        num_pages = html.escape(str(result["num_pages"]))
	        st.markdown(
	            '<div class="result-box">'
	            f'<p class="file-info"><strong>{filename}</strong> ({num_pages} pages)</p>'
	            f'<p class="doc-type">📑 {doc_type}</p>'
	            "</div>",
	            unsafe_allow_html=True,
	        )


	def main():
	    """Build the page: header, model load, history sidebar, upload and results.

	    The left column handles uploads, previews and the classify button; the
	    right column shows whatever results are stored in the session state.
	    """
	    # Header
	    st.markdown('<div class="main-header">📄 Document Classifier</div>', unsafe_allow_html=True)
	    st.markdown("Upload PDF documents to classify them using SmolVLM AI.")

	    # PRE-LOAD MODEL AT APP START (not on button click)
	    with st.spinner("🔄 Loading AI model (one-time setup)..."):
	        classifier = load_classifier()
	    st.success("✅ Model ready!")

	    # Sidebar for history
	    _render_sidebar()

	    # Main content - two columns
	    col1, col2 = st.columns([1, 1])

	    with col1:
	        st.subheader("📤 Upload Documents")

	        # File uploader - MULTIPLE FILES
	        uploaded_files = st.file_uploader(
	            "Choose PDF files",
	            type=["pdf"],
	            accept_multiple_files=True,
	            help="Upload one or more PDF documents to classify",
	        )

	        if uploaded_files:
	            st.success(f"✅ Uploaded {len(uploaded_files)} file(s)")

	            # Show file list with preview option
	            _render_previews(uploaded_files)

	            # Classify button
	            if st.button("🔍 Classify All Documents", type="primary", width="stretch"):
	                _classify_all(classifier, uploaded_files)

	    with col2:
	        st.subheader("📊 Classification Results")
	        _render_results()


	# Script entry point: build the page when this file is executed directly.
	if __name__ == "__main__":
	    main()