Spaces:

sandtemp01
/

extract_selva_ui

Sleeping

App Files Files Community

extract_selva_ui / src /streamlit_app.py

sandtemp01

Update src/streamlit_app.py

adda851 verified 28 days ago

raw

history blame contribute delete

19.2 kB

	# Run with: streamlit run src/streamlit_app.py
	import streamlit as st
	import requests
	import fitz # PyMuPDF
	import docx
	from io import BytesIO
	import re
	from datetime import datetime
	from typing import Dict, List, Tuple

	# Configure the Streamlit page: title and icon, wide layout, and a sidebar
	# that starts collapsed.
	st.set_page_config(
	page_title="PDF Entity Extractor",
	page_icon="📄",
	layout="wide",
	initial_sidebar_state="collapsed"
	)

	# Inject the app's custom CSS. unsafe_allow_html=True is passed so the
	# <style> element is emitted as raw HTML. The markup rendered later in this
	# file uses the .main-header, .section-header and .info-box classes; the
	# other custom classes defined here (.success-box, .entity-card, .clear-btn,
	# .download-btn, .entity-item, .url-input-container) are not referenced by
	# any markup visible in this file.
	st.markdown("""
	<style>
	.main-header {
	font-size: 2.5rem;
	color: #1E3A8A;
	text-align: center;
	margin-bottom: 1rem;
	font-weight: 600;
	}
	.section-header {
	font-size: 1.5rem;
	color: #374151;
	margin-bottom: 1rem;
	font-weight: 600;
	border-bottom: 2px solid #E5E7EB;
	padding-bottom: 0.5rem;
	}
	.info-box {
	background-color: #F3F4F6;
	padding: 1.5rem;
	border-radius: 8px;
	border-left: 4px solid #3B82F6;
	margin-bottom: 1.5rem;
	}
	.success-box {
	background-color: #D1FAE5;
	padding: 1rem;
	border-radius: 6px;
	border-left: 4px solid #10B981;
	margin: 1rem 0;
	}
	.entity-card {
	background-color: white;
	border: 1px solid #E5E7EB;
	border-radius: 8px;
	padding: 1rem;
	margin-bottom: 1rem;
	box-shadow: 0 1px 3px rgba(0, 0, 0, 0.1);
	}
	.stButton button {
	background-color: #2563EB;
	color: white;
	border: none;
	padding: 0.5rem 1rem;
	border-radius: 6px;
	font-weight: 500;
	transition: background-color 0.3s;
	height: 44px;
	margin-top: 1.6rem;
	}
	.stButton button:hover {
	background-color: #1D4ED8;
	}
	.clear-btn button {
	background-color: #EF4444 !important;
	height: 44px;
	margin-top: 1.6rem;
	}
	.clear-btn button:hover {
	background-color: #DC2626 !important;
	}
	.download-btn {
	background-color: #10B981 !important;
	}
	.download-btn:hover {
	background-color: #0DA271 !important;
	}
	.stDownloadButton button {
	width: 100%;
	}
	.stExpander {
	border: 1px solid #E5E7EB;
	border-radius: 8px;
	margin-bottom: 0.5rem;
	}
	.entity-item {
	padding: 0.5rem 0;
	border-bottom: 1px solid #F3F4F6;
	}
	.entity-item:last-child {
	border-bottom: none;
	}
	.url-input-container {
	margin-bottom: 1rem;
	}
	</style>
	""", unsafe_allow_html=True)

	# Seed st.session_state with a default for every key the app reads. Keys that
	# already exist (from an earlier rerun) are left untouched.
	for _state_key, _initial_value in (
	    ('extracted_text', ""),
	    ('entities', {}),
	    ('docx_buffer', None),
	    ('pdf_bytes', None),
	    ('filename', ""),
	    ('show_entities', False),
	    ('pdf_preview_buffer', None),
	):
	    if _state_key not in st.session_state:
	        st.session_state[_state_key] = _initial_value

	def extract_text_from_pdf(pdf_bytes: bytes) -> str:
	    """Return the text of every page of a PDF, concatenated in page order.

	    Args:
	        pdf_bytes: Raw content of the PDF, opened in memory with PyMuPDF.

	    Returns:
	        The concatenated page text. If opening or reading raises, the error
	        is shown with ``st.error`` and whatever text was gathered before the
	        failure (possibly an empty string) is returned.
	    """
	    page_texts: List[str] = []
	    try:
	        with fitz.open(stream=pdf_bytes, filetype="pdf") as document:
	            for pdf_page in document:
	                page_texts.append(pdf_page.get_text())
	    except Exception as exc:
	        st.error(f"Error extracting text from PDF: {str(exc)}")
	    return "".join(page_texts)

	def extract_entities(text: str) -> Dict[str, List[str]]:
	    """Extract entities from text using regex patterns.

	    The text is scanned for dates, e-mail addresses, phone numbers, URLs,
	    monetary values, titled personal names and organisation markers.

	    Args:
	        text: Plain text to scan (for example the text layer of a PDF).

	    Returns:
	        A dict with the keys "Dates", "Email Addresses", "Phone Numbers",
	        "URLs", "Monetary Values", "Names" and "Organizations". Each value is
	        a sorted list of the distinct substrings that matched; a type with no
	        match maps to an empty list.
	    """
	    # Numeric dates (d/m/y or m/d/y), month-name dates, and ISO-like y-m-d.
	    date_patterns = [
	        r'\b\d{1,2}[/-]\d{1,2}[/-]\d{2,4}\b',
	        r'\b(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]* \d{1,2},? \d{4}\b',
	        r'\b\d{4}[/-]\d{1,2}[/-]\d{1,2}\b',
	    ]

	    # The TLD class is letters only; "|" must not be accepted inside it.
	    email_pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b'

	    # 555-123-4567 / 555.123.4567 / 5551234567, and (555) 123-4567.
	    # The second form has no leading \b: "(" is not a word character, so a
	    # word boundary in front of it would require a letter/digit before it.
	    phone_patterns = [
	        r'\b\d{3}[-.]?\d{3}[-.]?\d{4}\b',
	        r'\(\d{3}\)\s*\d{3}[-.]?\d{4}\b',
	    ]

	    url_pattern = r'https?://(?:[-\w.]|(?:%[\da-fA-F]{2}))+[/\w\.-]*'

	    # "$" amounts with optional thousands groups and optional cents, or a
	    # number followed by "dollars"/"USD". The thousands group is optional so
	    # that plain amounts such as "$19.99" are matched too.
	    money_pattern = r'\$\d+(?:,\d{3})*(?:\.\d{2})?|\b\d+\s(?:dollars|USD)\b'

	    # Title followed by two capitalised words (simplified).
	    name_pattern = r'\b(?:Mr\.|Ms\.|Mrs\.|Dr\.)\s+[A-Z][a-z]+\s+[A-Z][a-z]+\b'

	    # A bare legal suffix, or one word followed by a suffix. The first
	    # pattern ends with (?!\w) instead of \b so that suffixes ending in "."
	    # ("Inc.", "Ltd.") still match when followed by a space or end of text.
	    org_patterns = [
	        r'\b(?:Inc\.|LLC|Ltd\.|Corp\.|Company|Co\.|Corporation|Incorporated)(?!\w)',
	        r'\b[A-Z][a-z]+\s+(?:Inc|LLC|Ltd|Corp|Co)\b',
	    ]

	    entities: Dict[str, List[str]] = {
	        "Dates": [],
	        "Email Addresses": [],
	        "Phone Numbers": [],
	        "URLs": [],
	        "Monetary Values": [],
	        "Names": [],
	        "Organizations": [],
	    }

	    for pattern in date_patterns:
	        entities["Dates"].extend(re.findall(pattern, text, re.IGNORECASE))

	    entities["Email Addresses"].extend(re.findall(email_pattern, text, re.IGNORECASE))

	    for pattern in phone_patterns:
	        entities["Phone Numbers"].extend(re.findall(pattern, text))

	    entities["URLs"].extend(re.findall(url_pattern, text, re.IGNORECASE))

	    entities["Monetary Values"].extend(re.findall(money_pattern, text, re.IGNORECASE))

	    # Names are matched case-sensitively: the pattern relies on capitals.
	    entities["Names"].extend(re.findall(name_pattern, text))

	    for pattern in org_patterns:
	        entities["Organizations"].extend(re.findall(pattern, text, re.IGNORECASE))

	    # Drop duplicates and return each list in sorted order.
	    return {key: sorted(set(found)) for key, found in entities.items()}

	def create_word_document(entities: Dict[str, List[str]], filename: str) -> BytesIO:
	    """Create a Word document from extracted entities.

	    The report contains a centred title, the source name and generation
	    time, a summary table with one row per non-empty entity type plus a
	    grand total, and a bulleted list of values for each non-empty type.

	    Args:
	        entities: Mapping of entity type to the list of extracted values.
	        filename: Name of the source document, shown in the report header.

	    Returns:
	        A ``BytesIO`` holding the .docx file, positioned at offset 0.
	    """
	    # Imported locally so the block does not depend on a new file-level import.
	    from docx.enum.text import WD_ALIGN_PARAGRAPH

	    doc = docx.Document()

	    # Title
	    title = doc.add_heading('Extracted Entities Report', 0)
	    title.alignment = WD_ALIGN_PARAGRAPH.CENTER

	    # Metadata
	    doc.add_paragraph(f"Source Document: {filename}")
	    doc.add_paragraph(f"Extraction Date: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
	    doc.add_paragraph("")

	    # Summary table. Styles are referenced by their UI name; python-docx
	    # deprecates lookup by style id ('LightShading-Accent1').
	    doc.add_heading('Summary', level=1)
	    summary_table = doc.add_table(rows=1, cols=2)
	    summary_table.style = 'Light Shading Accent 1'
	    hdr_cells = summary_table.rows[0].cells
	    hdr_cells[0].text = 'Entity Type'
	    hdr_cells[1].text = 'Count'

	    # Only entity types that produced at least one value are reported.
	    non_empty = [(entity_type, values) for entity_type, values in entities.items() if values]

	    for entity_type, values in non_empty:
	        row_cells = summary_table.add_row().cells
	        row_cells[0].text = entity_type
	        row_cells[1].text = str(len(values))

	    total_entities = sum(len(values) for _, values in non_empty)
	    doc.add_paragraph(f"\nTotal Entities Extracted: {total_entities}")
	    doc.add_paragraph("")

	    # Detailed listing
	    doc.add_heading('Detailed Entities', level=1)

	    for entity_type, values in non_empty:
	        doc.add_heading(entity_type, level=2)
	        for value in values:
	            # The 'List Bullet' style supplies the bullet glyph itself, so no
	            # literal bullet character is prepended to the text.
	            doc.add_paragraph(value, style='List Bullet')
	        doc.add_paragraph()

	    # Serialise into memory and rewind so callers can read from the start.
	    buffer = BytesIO()
	    doc.save(buffer)
	    buffer.seek(0)
	    return buffer

	def create_pdf_preview(entities: Dict[str, List[str]], filename: str) -> BytesIO:
	    """Create a simple PDF preview of the extracted entities using PyMuPDF.

	    The preview contains a title, the source name and generation time, a
	    summary table (one row per non-empty entity type plus a total row) and a
	    "Detailed Entities" section listing every value. New pages are started
	    when the running y position passes the thresholds checked below.

	    Args:
	        entities: Mapping of entity type to the list of extracted values.
	        filename: Name of the source document, shown in the "Source:" line.

	    Returns:
	        A ``BytesIO`` holding the generated PDF, positioned at offset 0.
	    """
	    buffer = BytesIO()

	    margin = 50
	    line_height = 14
	    page_height = 800  # y limit the page-break checks below compare against

	    heading_blue = (0.12, 0.23, 0.54)  # #1E3A8A
	    heading_gray = (0.22, 0.26, 0.32)  # #374151
	    stripe_gray = (0.95, 0.96, 0.97)   # light gray
	    white = (1, 1, 1)

	    # The context manager closes the document even when drawing or saving
	    # raises, so the PyMuPDF document is never leaked.
	    with fitz.open() as doc:
	        page = doc.new_page()
	        x = margin
	        y = margin

	        # Title
	        page.insert_text((x, y), "Extracted Entities Preview", fontsize=16, color=heading_blue)
	        y += 40

	        # Metadata
	        page.insert_text((x, y), f"Source: {filename}", fontsize=10)
	        y += line_height * 1.5

	        page.insert_text((x, y), f"Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}", fontsize=10)
	        y += 30

	        # Summary section
	        page.insert_text((x, y), "Summary", fontsize=12, color=heading_gray)
	        y += 25

	        # Summary table header
	        page.draw_rect((x, y, x + 300, y + 25), fill=(0.23, 0.51, 0.96))  # #3B82F6
	        page.insert_text((x + 10, y + 18), "Entity Type", fontsize=10, color=white)
	        page.insert_text((x + 210, y + 18), "Count", fontsize=10, color=white)
	        y += 25

	        total_entities = 0
	        row_index = 0

	        # One striped row per entity type that has values
	        for entity_type, values in entities.items():
	            if not values:
	                continue
	            fill_color = stripe_gray if row_index % 2 == 0 else white
	            page.draw_rect((x, y, x + 300, y + 20), fill=fill_color)
	            page.insert_text((x + 10, y + 13), entity_type, fontsize=10)
	            page.insert_text((x + 210, y + 13), str(len(values)), fontsize=10)
	            y += 20
	            total_entities += len(values)
	            row_index += 1

	        # Total row
	        y += 5
	        page.draw_rect((x, y, x + 300, y + 25), fill=stripe_gray)
	        page.insert_text((x + 10, y + 18), "Total", fontsize=10, fontname="helv", color=(0, 0, 0))
	        page.insert_text((x + 210, y + 18), str(total_entities), fontsize=10, fontname="helv", color=(0, 0, 0))
	        y += 40

	        # Detailed entities section; start a new page if little room is left
	        if y > page_height - 100:
	            page = doc.new_page()
	            y = margin

	        page.insert_text((x, y), "Detailed Entities", fontsize=12, color=heading_gray)
	        y += 30

	        for entity_type, values in entities.items():
	            if not values:
	                continue

	            if y > page_height - 50:
	                page = doc.new_page()
	                y = margin

	            page.insert_text((x, y), f"{entity_type}:", fontsize=11, color=heading_blue)
	            y += 20

	            for value in values:
	                if y > page_height - 30:
	                    page = doc.new_page()
	                    y = margin

	                page.insert_text((x + 20, y), f"• {value}", fontsize=10)
	                y += line_height

	            # Extra gap between entity types
	            y += 10

	        doc.save(buffer)

	    buffer.seek(0)
	    return buffer

	def display_pdf_viewer(pdf_bytes: bytes, filename: str, preview: bool = False):
	    """Show the first page of a PDF as an image, with a file/page-count caption.

	    Args:
	        pdf_bytes: PDF content handed to ``fitz.open(stream=...)``. ``main``
	            in this file also passes the ``BytesIO`` preview buffer here.
	        filename: Name shown in the caption when ``preview`` is False.
	        preview: When True the caption shows "Entities Preview.pdf" instead
	            of ``filename``.

	    Any exception raised while opening or rendering is reported with
	    ``st.error`` rather than propagated.
	    """
	    try:
	        # Render inside a context manager so the document is closed even if
	        # page access or rasterisation raises.
	        with fitz.open(stream=pdf_bytes, filetype="pdf") as pdf_document:
	            first_page = pdf_document[0]
	            img_bytes = first_page.get_pixmap(dpi=150).tobytes("png")
	            page_count = len(pdf_document)

	        # Display the rendered page
	        st.image(img_bytes, use_container_width=True)

	        # Document info
	        col1, col2 = st.columns(2)
	        with col1:
	            display_name = "Entities Preview.pdf" if preview else filename
	            st.caption(f"File: {display_name}")
	        with col2:
	            st.caption(f"Pages: {page_count}")

	    except Exception as e:
	        st.error(f"Error displaying PDF: {str(e)}")

	def fetch_pdf_from_url(url: str) -> Tuple[bool, bytes]:
	    """Download a PDF over HTTP(S).

	    The response is accepted when its Content-Type header contains "pdf" or
	    the URL ends with ".pdf" (case-insensitive).

	    Args:
	        url: Address to request.

	    Returns:
	        ``(True, content)`` on success. ``(False, b"")`` when the response
	        does not look like a PDF (a warning is shown) or when the request
	        fails (the error is shown).
	    """
	    request_headers = {
	        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
	    }
	    try:
	        reply = requests.get(url, headers=request_headers, timeout=10)
	        reply.raise_for_status()

	        declared_type = reply.headers.get('content-type', '').lower()
	        looks_like_pdf = 'pdf' in declared_type or url.lower().endswith('.pdf')
	        if not looks_like_pdf:
	            st.warning("The URL does not appear to point to a PDF file.")
	            return False, b""
	        return True, reply.content
	    except requests.exceptions.RequestException as exc:
	        st.error(f"Error fetching PDF: {str(exc)}")
	        return False, b""

	def clear_session():
	    """Reset every session-state key used by the app to its empty value."""
	    blank_state = {
	        'extracted_text': "",
	        'entities': {},
	        'docx_buffer': None,
	        'pdf_bytes': None,
	        'filename': "",
	        'show_entities': False,
	        'pdf_preview_buffer': None,
	    }
	    for state_key, blank_value in blank_state.items():
	        setattr(st.session_state, state_key, blank_value)

	# Main application
	def main():
	    """Render the app: PDF input, document viewer and entity extraction.

	    Flow of one script run:
	      1. Draw the header, the URL row (text input, "Fetch", "Clear All") and
	         the file uploader.
	      2. Load a PDF into ``st.session_state`` from the URL (when "Fetch" was
	         clicked) or from the uploader.
	      3. If a PDF is loaded, show it on the left and the extraction controls
	         and results on the right; otherwise show the welcome text.
	    """
	    # Header
	    st.markdown('<div class="main-header">PDF Entity Extractor</div>', unsafe_allow_html=True)

	    # Top section for URL input with aligned buttons
	    st.markdown("### Fetch PDF from URL")

	    url_col1, url_col2, url_col3 = st.columns([4, 1, 1])

	    with url_col1:
	        url_input = st.text_input(
	            "Enter PDF URL",
	            placeholder="https://example.com/document.pdf",
	            label_visibility="collapsed",
	            key="url_input"
	        )

	    with url_col2:
	        fetch_btn = st.button("Fetch", use_container_width=True, type="primary")

	    with url_col3:
	        if st.button("Clear All", use_container_width=True, type="secondary"):
	            clear_session()
	            st.rerun()

	    # File upload section
	    st.markdown("---")
	    uploaded_file = st.file_uploader(
	        "Upload PDF File",
	        type=["pdf"],
	        help="Select a PDF file from your device",
	        label_visibility="collapsed"
	    )

	    # Handle URL fetch
	    if fetch_btn and url_input:
	        with st.spinner("Fetching PDF document..."):
	            success, pdf_bytes = fetch_pdf_from_url(url_input)
	            if success:
	                st.session_state.pdf_bytes = pdf_bytes
	                st.session_state.filename = url_input.split("/")[-1] or "document.pdf"
	                st.success("PDF document fetched successfully")
	                st.session_state.show_entities = False

	    # Handle file upload
	    elif uploaded_file is not None:
	        # The uploader keeps returning the same file on every rerun. Only
	        # treat it as a new document when it differs from what is already
	        # loaded; otherwise show_entities would be reset on every later
	        # interaction (e.g. clicking a download button) and the extraction
	        # results would disappear.
	        uploaded_bytes = uploaded_file.getvalue()
	        is_new_document = (
	            uploaded_file.name != st.session_state.filename
	            or uploaded_bytes != st.session_state.pdf_bytes
	        )
	        if is_new_document:
	            st.session_state.pdf_bytes = uploaded_bytes
	            st.session_state.filename = uploaded_file.name
	            st.session_state.show_entities = False

	    # Process PDF if available
	    if st.session_state.pdf_bytes:
	        col1, col2 = st.columns([1, 1], gap="large")

	        with col1:
	            st.markdown('<div class="section-header">PDF Document</div>', unsafe_allow_html=True)
	            display_pdf_viewer(st.session_state.pdf_bytes, st.session_state.filename)

	            # Download button for the source PDF
	            st.download_button(
	                label="Download PDF",
	                data=st.session_state.pdf_bytes,
	                file_name=st.session_state.filename,
	                mime="application/pdf",
	                use_container_width=True,
	                key="pdf_download"
	            )

	        with col2:
	            st.markdown('<div class="section-header">Entity Extraction</div>', unsafe_allow_html=True)

	            # Run the extraction and cache every artefact in session state
	            if st.button("Extract Entities", use_container_width=True, type="primary"):
	                with st.spinner("Processing document..."):
	                    text = extract_text_from_pdf(st.session_state.pdf_bytes)
	                    if text.strip():
	                        entities = extract_entities(text)
	                        st.session_state.extracted_text = text
	                        st.session_state.entities = entities

	                        # Word report
	                        docx_buffer = create_word_document(entities, st.session_state.filename)
	                        st.session_state.docx_buffer = docx_buffer

	                        # PDF preview of the same report
	                        pdf_preview_buffer = create_pdf_preview(entities, st.session_state.filename)
	                        st.session_state.pdf_preview_buffer = pdf_preview_buffer

	                        st.session_state.show_entities = True
	                        st.success(f"Extracted {sum(len(v) for v in entities.values())} entities")
	                    else:
	                        st.error("Could not extract text from PDF. The document may be scanned or contain only images.")

	            # Display entities if available
	            if st.session_state.show_entities and st.session_state.entities:
	                if any(st.session_state.entities.values()):
	                    st.markdown("### Extracted Entities Preview")

	                    # Rendered preview of the report
	                    if st.session_state.pdf_preview_buffer:
	                        display_pdf_viewer(st.session_state.pdf_preview_buffer, "", preview=True)

	                    # Download Word document button
	                    if st.session_state.docx_buffer:
	                        st.markdown("### Download Report")
	                        doc_name = f"{st.session_state.filename.rsplit('.', 1)[0]}_entities_report.docx"

	                        st.download_button(
	                            label="Download Word Report",
	                            data=st.session_state.docx_buffer,
	                            file_name=doc_name,
	                            mime="application/vnd.openxmlformats-officedocument.wordprocessingml.document",
	                            use_container_width=True,
	                            key="docx_download"
	                        )

	                        st.caption("Note: The preview above is a PDF. The download will be a Word document containing the complete report.")

	                else:
	                    st.info("No entities found in the document.")
	            else:
	                st.info("Click 'Extract Entities' to begin entity extraction.")

	    else:
	        # Show instructions when no PDF is loaded
	        st.markdown('<div class="info-box">', unsafe_allow_html=True)
	        st.markdown("""
	        ### Welcome to PDF Entity Extractor

	        To get started, you can either:
	        1. Enter a URL - Provide a direct link to a PDF file above
	        2. Upload a PDF - Use the upload area below to select a file from your device

	        What this tool does:
	        - Extracts entities like dates, email addresses, phone numbers, URLs, monetary values, names, and organizations
	        - Generates a comprehensive Word document report
	        - Provides a PDF preview of extracted entities

	        Supported formats: PDF files with extractable text
	        """)
	        st.markdown('</div>', unsafe_allow_html=True)

	# Entry point: build the UI when this file is executed as a script.
	if __name__ == "__main__":
	    main()