# AIDocScanner — src/streamlit_app.py
# Streamlit front-end for the document classifier.
import json
import os
import tempfile
import time
from typing import Dict, List, Optional

import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import requests
import streamlit as st

from document_classifier import DocumentClassifier
# Page configuration — must run before any other Streamlit call.
st.set_page_config(
    page_title="Document Classifier",
    page_icon="πŸ“„",  # browser-tab icon
    layout="wide",  # use the full browser width for the results tables/charts
    initial_sidebar_state="expanded"
)
# Custom CSS for better styling
st.markdown("""
<style>
.main-header {
font-size: 3rem;
color: #1f77b4;
text-align: center;
margin-bottom: 2rem;
}
.sub-header {
font-size: 1.5rem;
color: #2c3e50;
margin-top: 2rem;
margin-bottom: 1rem;
}
.metric-card {
background-color: #f8f9fa;
padding: 1rem;
border-radius: 0.5rem;
border-left: 4px solid #1f77b4;
}
.success-message {
background-color: #d4edda;
color: #155724;
padding: 1rem;
border-radius: 0.5rem;
border: 1px solid #c3e6cb;
}
.error-message {
background-color: #f8d7da;
color: #721c24;
padding: 1rem;
border-radius: 0.5rem;
border: 1px solid #f5c6cb;
}
</style>
""", unsafe_allow_html=True)
# Initialize session state: seed each key only on the first script run so
# reruns don't clobber the cached classifier or previous results.
_SESSION_DEFAULTS = {
    "classifier": None,
    "classification_results": [],
    "uploaded_files": [],
}
for _key, _default in _SESSION_DEFAULTS.items():
    if _key not in st.session_state:
        st.session_state[_key] = _default
def initialize_classifier():
    """Lazily build the DocumentClassifier and cache it in session state.

    Returns:
        True when a classifier is available (already cached, or just
        constructed successfully); False when construction raised.
    """
    # Already initialized on a previous run — nothing to do.
    if st.session_state.classifier is not None:
        return True
    with st.spinner("Loading Hugging Face models..."):
        try:
            st.session_state.classifier = DocumentClassifier()
            st.success("βœ… Document classifier initialized successfully!")
            return True
        except Exception as exc:
            st.error(f"❌ Failed to initialize classifier: {str(exc)}")
            return False
def save_uploaded_file(uploaded_file) -> Optional[str]:
    """Persist a Streamlit uploaded file to a temporary file on disk.

    The temp file keeps the original extension so downstream type detection
    still works.

    Args:
        uploaded_file: Streamlit UploadedFile-like object exposing `.name`
            and `.getbuffer()`.

    Returns:
        Path to the temp file, or None when writing fails (the error is
        shown in the UI via st.error).
    """
    try:
        # os.path.splitext keeps the leading dot and yields "" for
        # extension-less names; the previous `name.split('.')[-1]` produced
        # a bogus suffix (e.g. ".README") for files without a dot.
        suffix = os.path.splitext(uploaded_file.name)[1]
        with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp_file:
            tmp_file.write(uploaded_file.getbuffer())
            return tmp_file.name
    except Exception as e:
        st.error(f"Error saving file: {str(e)}")
        return None
def classify_single_file(file_path: str) -> Dict:
    """Run the session-cached classifier on one file.

    Returns the classifier's result dict, or an
    {"error": ..., "success": False} dict when the classifier is missing
    or the classification raises.
    """
    classifier = st.session_state.classifier
    if not classifier:
        return {"error": "Classifier not initialized", "success": False}
    try:
        return classifier.classify_document(file_path)
    except Exception as exc:
        return {"error": str(exc), "success": False}
def classify_multiple_files(file_paths: List[str]) -> List[Dict]:
    """Classify a batch of files via the session-cached classifier.

    Returns the classifier's list of result dicts, or a single-element
    error list when the classifier is missing or the batch call raises.
    """
    classifier = st.session_state.classifier
    if not classifier:
        return [{"error": "Classifier not initialized", "success": False}]
    try:
        return classifier.classify_multiple_documents(file_paths)
    except Exception as exc:
        return [{"error": str(exc), "success": False}]
def classify_with_api(file_path: str) -> Dict:
    """Call the FastAPI /classify endpoint with an image file.

    Posts the file as multipart form data and backfills any keys the API
    may omit so display_classification_result() can rely on them.

    Args:
        file_path: Path to the file to classify.

    Returns:
        The (normalized) JSON response dict on HTTP 200, otherwise
        {"error": ..., "success": False} for any HTTP, connection, or
        file-system failure.
    """
    api_url = "http://localhost:8000/classify"  # Adjust if API runs elsewhere
    try:
        with open(file_path, "rb") as file_data:
            files = {"file": (os.path.basename(file_path), file_data)}
            # A request without a timeout can hang the Streamlit script
            # forever if the API is down; fail after 30s instead.
            response = requests.post(api_url, files=files, timeout=30)
        if response.status_code != 200:
            return {"error": response.text, "success": False}
        data = response.json()
        # Backfill keys that might be absent from a minimal API response.
        extension = os.path.splitext(file_path)[1].lstrip('.')
        data.setdefault('file_path', file_path)
        data.setdefault('file_name', os.path.basename(file_path))
        data.setdefault('file_extension', extension)
        data.setdefault('content_length', 0)
        data.setdefault('text_preview', '')
        data.setdefault('file_type', extension)
        data.setdefault('all_scores', {})
        return data
    except Exception as e:
        return {"error": str(e), "success": False}
def display_classification_result(result: Dict):
    """Render one classification result: metrics, details, preview, scores.

    Expects the result dict produced by the classifier or classify_with_api,
    with keys: success, file_type, classification, confidence, file_name,
    file_extension, content_length, file_path, text_preview, all_scores.
    """
    # Failed results only get an error banner — none of the other keys are
    # guaranteed to exist in that case.
    if not result.get('success', False):
        st.error(f"❌ Classification failed: {result.get('error', 'Unknown error')}")
        return
    # Headline metrics in three columns.
    col1, col2, col3 = st.columns(3)
    with col1:
        st.metric("Document Type", result['file_type'])
    with col2:
        st.metric("Classification", result['classification'].title())
    with col3:
        st.metric("Confidence", f"{result['confidence']:.2%}")
    # Display detailed file information in two columns.
    st.subheader("πŸ“‹ Document Details")
    col1, col2 = st.columns(2)
    with col1:
        st.write(f"**File Name:** {result['file_name']}")
        st.write(f"**File Extension:** {result['file_extension']}")
        st.write(f"**Content Length:** {result['content_length']} characters")
    with col2:
        st.write(f"**File Path:** {result['file_path']}")
        st.write(f"**Classification Confidence:** {result['confidence']:.2%}")
    # Display text preview (skipped when the preview string is empty).
    if result['text_preview']:
        st.subheader("πŸ“– Text Preview")
        st.text_area("Content Preview", result['text_preview'], height=150, disabled=True)
    # Display all per-label classification scores, highest first.
    st.subheader("πŸ“Š Classification Scores")
    scores_df = pd.DataFrame(list(result['all_scores'].items()), columns=['Document Type', 'Score'])
    scores_df['Score'] = scores_df['Score'].round(4)
    scores_df = scores_df.sort_values('Score', ascending=False)
    # Create a bar chart of the scores.
    fig = px.bar(scores_df, x='Document Type', y='Score',
                 title="Classification Confidence Scores",
                 color='Score',
                 color_continuous_scale='Blues')
    fig.update_layout(xaxis_tickangle=-45)  # slanted labels to avoid overlap
    st.plotly_chart(fig, use_container_width=True)
    # Display the same scores as a table.
    st.dataframe(scores_df, use_container_width=True)
def display_batch_results(results: List[Dict]):
    """Render a batch of classification results: summary, charts, tables.

    Splits `results` into successes and failures by their 'success' flag,
    then shows summary metrics, a type-distribution pie, a detail table for
    successes, and an error line per failure.
    """
    if not results:
        st.warning("No results to display.")
        return
    # Summary statistics: partition once, reuse below.
    successful_results = [r for r in results if r.get('success', False)]
    failed_results = [r for r in results if not r.get('success', False)]
    col1, col2, col3, col4 = st.columns(4)
    with col1:
        st.metric("Total Files", len(results))
    with col2:
        st.metric("Successful", len(successful_results))
    with col3:
        st.metric("Failed", len(failed_results))
    with col4:
        # Fourth column stays empty when there were no successes.
        if successful_results:
            avg_confidence = sum(r['confidence'] for r in successful_results) / len(successful_results)
            st.metric("Avg Confidence", f"{avg_confidence:.2%}")
    # Classification distribution pie chart (successes only).
    if successful_results:
        st.subheader("πŸ“Š Classification Distribution")
        classifications = [r['classification'] for r in successful_results]
        classification_counts = pd.Series(classifications).value_counts()
        fig = px.pie(values=classification_counts.values,
                     names=classification_counts.index,
                     title="Document Type Distribution")
        st.plotly_chart(fig, use_container_width=True)
    # Detailed results table for successful classifications.
    st.subheader("πŸ“‹ Detailed Results")
    if successful_results:
        results_data = []
        for result in successful_results:
            results_data.append({
                'File Name': result['file_name'],
                'File Type': result['file_type'],
                'Classification': result['classification'].title(),
                'Confidence': f"{result['confidence']:.2%}",
                'Content Length': result['content_length']
            })
        results_df = pd.DataFrame(results_data)
        st.dataframe(results_df, use_container_width=True)
    # Show failed results, one error banner per file.
    if failed_results:
        st.subheader("❌ Failed Classifications")
        for result in failed_results:
            st.error(f"**{result.get('file_name', 'Unknown')}**: {result.get('error', 'Unknown error')}")
def main():
    """Main Streamlit application: header, sidebar, and the three tabs.

    Tab 1 classifies a single document (sample scan via the API, or an
    upload via the local classifier); tab 2 classifies a batch of uploads;
    tab 3 shows the latest results and offers CSV/JSON export.
    """
    # Header
    st.markdown('<h1 class="main-header">πŸ“„ Document Classifier</h1>', unsafe_allow_html=True)
    st.markdown("""
    <div style="text-align: center; margin-bottom: 2rem;">
    <p style="font-size: 1.2rem; color: #666;">
    Classify documents using Hugging Face models and content analysis
    </p>
    </div>
    """, unsafe_allow_html=True)
    # Sidebar
    st.sidebar.title("βš™οΈ Settings")
    # Manual (re-)initialization of the classifier.
    if st.sidebar.button("πŸ”„ Initialize Classifier", type="primary"):
        initialize_classifier()
    # Model information
    st.sidebar.subheader("πŸ€– Model Information")
    st.sidebar.info("""
    **Models Used:**
    - Cardiff NLP Twitter RoBERTa Base Emotion
    - DistilBERT Base Uncased (fallback)
    **Supported Formats:**
    - PDF, DOCX, DOC
    - TXT, CSV
    - XLSX, XLS
    - Images (JPG, PNG, etc.)
    """)
    # Main content
    tab1, tab2, tab3 = st.tabs(["πŸ“ Single File", "πŸ“‚ Batch Upload", "πŸ“Š Results"])
    with tab1:
        st.subheader("\U0001F4C1 Classify Single Document")
        # Fix: guard against a missing sample_scans folder — os.listdir on a
        # non-existent path raised FileNotFoundError and crashed this tab.
        sample_scan_folder = os.path.join(os.path.dirname(__file__), "sample_scans")
        if os.path.isdir(sample_scan_folder):
            sample_files = sorted(
                f for f in os.listdir(sample_scan_folder)
                if f.lower().endswith((".png", ".jpg", ".jpeg"))
            )
        else:
            sample_files = []
        selected_sample = st.selectbox("Or select from example scans:", ["--- Select a sample ---"] + sample_files)
        if selected_sample != "--- Select a sample ---":
            sample_file_path = os.path.join(sample_scan_folder, selected_sample)
            st.image(sample_file_path, caption=f"Sample: {selected_sample}", use_column_width=True)
            if st.button("πŸ” Classify Sample Scan", key="classify_sample_scan"):
                with st.spinner("Calling API to classify sample scan..."):
                    result = classify_with_api(sample_file_path)
                display_classification_result(result)
        uploaded_file = st.file_uploader(
            "Choose a document file",
            type=['pdf', 'docx', 'doc', 'txt', 'csv', 'xlsx', 'xls', 'jpg', 'jpeg', 'png'],
            help="Upload a document to classify its type and content"
        )
        if uploaded_file is not None:
            if st.button("πŸ” Classify Document", type="primary"):
                if not initialize_classifier():
                    st.stop()
                # Save uploaded file to a temp path the classifier can read.
                file_path = save_uploaded_file(uploaded_file)
                if file_path:
                    with st.spinner("Classifying document..."):
                        result = classify_single_file(file_path)
                        st.session_state.classification_results = [result]
                    # Clean up temporary file; best-effort, ignore FS errors.
                    try:
                        os.unlink(file_path)
                    except OSError:
                        pass
                    # Display result
                    display_classification_result(result)
    with tab2:
        st.subheader("πŸ“‚ Batch Document Classification")
        uploaded_files = st.file_uploader(
            "Choose multiple document files",
            type=['pdf', 'docx', 'doc', 'txt', 'csv', 'xlsx', 'xls', 'jpg', 'jpeg', 'png'],
            accept_multiple_files=True,
            help="Upload multiple documents to classify them in batch"
        )
        if uploaded_files:
            st.write(f"πŸ“ {len(uploaded_files)} files selected")
            if st.button("πŸ” Classify All Documents", type="primary"):
                if not initialize_classifier():
                    st.stop()
                # Save all uploads first so progress reflects classification only.
                file_paths = []
                for uploaded_file in uploaded_files:
                    file_path = save_uploaded_file(uploaded_file)
                    if file_path:
                        file_paths.append(file_path)
                if file_paths:
                    progress_bar = st.progress(0)
                    status_text = st.empty()
                    results = []
                    for i, file_path in enumerate(file_paths):
                        status_text.text(f"Processing file {i+1}/{len(file_paths)}: {os.path.basename(file_path)}")
                        result = classify_single_file(file_path)
                        results.append(result)
                        progress_bar.progress((i + 1) / len(file_paths))
                        # Clean up temporary file; best-effort, ignore FS errors.
                        try:
                            os.unlink(file_path)
                        except OSError:
                            pass
                    st.session_state.classification_results = results
                    status_text.text("βœ… Classification complete!")
                    # Display batch results
                    display_batch_results(results)
    with tab3:
        st.subheader("πŸ“Š Classification Results")
        if st.session_state.classification_results:
            # One result gets the detailed single view, several get the batch view.
            if len(st.session_state.classification_results) == 1:
                display_classification_result(st.session_state.classification_results[0])
            else:
                display_batch_results(st.session_state.classification_results)
        else:
            st.info("πŸ‘† Upload and classify documents to see results here.")
        # Export results
        if st.session_state.classification_results:
            st.subheader("πŸ’Ύ Export Results")
            col1, col2 = st.columns(2)
            with col1:
                if st.button("πŸ“„ Export as CSV"):
                    # CSV export only includes successful classifications.
                    successful_results = [r for r in st.session_state.classification_results if r.get('success', False)]
                    if successful_results:
                        export_data = []
                        for result in successful_results:
                            export_data.append({
                                'File Name': result['file_name'],
                                'File Type': result['file_type'],
                                'Classification': result['classification'],
                                'Confidence': result['confidence'],
                                'Content Length': result['content_length']
                            })
                        df = pd.DataFrame(export_data)
                        csv = df.to_csv(index=False)
                        st.download_button(
                            label="Download CSV",
                            data=csv,
                            file_name="classification_results.csv",
                            mime="text/csv"
                        )
            with col2:
                if st.button("πŸ“‹ Export as JSON"):
                    # JSON export dumps the raw result dicts, failures included.
                    json_data = json.dumps(st.session_state.classification_results, indent=2)
                    st.download_button(
                        label="Download JSON",
                        data=json_data,
                        file_name="classification_results.json",
                        mime="application/json"
                    )
if __name__ == "__main__":
main()