# extract_selva_ui / src/streamlit_app.py
# sandtemp01: "Update src/streamlit_app.py" (commit adda851, verified)
# streamlit run extraction_ui.py
import streamlit as st
import requests
import fitz # PyMuPDF
import docx
from io import BytesIO
import re
from datetime import datetime
from typing import Dict, List, Tuple
# Page-level settings: browser-tab title and icon, wide layout, and a sidebar
# that starts collapsed.
_page_settings = {
    "page_title": "PDF Entity Extractor",
    "page_icon": "📄",
    "layout": "wide",
    "initial_sidebar_state": "collapsed",
}
st.set_page_config(**_page_settings)
# Custom CSS for professional styling.
# A single <style> element is injected through st.markdown; unsafe_allow_html=True
# is passed so the raw HTML/CSS is emitted instead of being shown as text.
# The rules cover the page and section headers, the info/success boxes, entity
# cards and list items, buttons (.stButton / .stDownloadButton — presumably
# Streamlit's own widget class names), expanders and the URL input container.
# NOTE(review): only .main-header, .section-header and .info-box are referenced
# by markup visible in this file; the other custom classes (.success-box,
# .entity-card, .clear-btn, .download-btn, .entity-item, .url-input-container)
# look unused — confirm before pruning.
st.markdown("""
<style>
.main-header {
font-size: 2.5rem;
color: #1E3A8A;
text-align: center;
margin-bottom: 1rem;
font-weight: 600;
}
.section-header {
font-size: 1.5rem;
color: #374151;
margin-bottom: 1rem;
font-weight: 600;
border-bottom: 2px solid #E5E7EB;
padding-bottom: 0.5rem;
}
.info-box {
background-color: #F3F4F6;
padding: 1.5rem;
border-radius: 8px;
border-left: 4px solid #3B82F6;
margin-bottom: 1.5rem;
}
.success-box {
background-color: #D1FAE5;
padding: 1rem;
border-radius: 6px;
border-left: 4px solid #10B981;
margin: 1rem 0;
}
.entity-card {
background-color: white;
border: 1px solid #E5E7EB;
border-radius: 8px;
padding: 1rem;
margin-bottom: 1rem;
box-shadow: 0 1px 3px rgba(0, 0, 0, 0.1);
}
.stButton button {
background-color: #2563EB;
color: white;
border: none;
padding: 0.5rem 1rem;
border-radius: 6px;
font-weight: 500;
transition: background-color 0.3s;
height: 44px;
margin-top: 1.6rem;
}
.stButton button:hover {
background-color: #1D4ED8;
}
.clear-btn button {
background-color: #EF4444 !important;
height: 44px;
margin-top: 1.6rem;
}
.clear-btn button:hover {
background-color: #DC2626 !important;
}
.download-btn {
background-color: #10B981 !important;
}
.download-btn:hover {
background-color: #0DA271 !important;
}
.stDownloadButton button {
width: 100%;
}
.stExpander {
border: 1px solid #E5E7EB;
border-radius: 8px;
margin-bottom: 0.5rem;
}
.entity-item {
padding: 0.5rem 0;
border-bottom: 1px solid #F3F4F6;
}
.entity-item:last-child {
border-bottom: none;
}
.url-input-container {
margin-bottom: 1rem;
}
</style>
""", unsafe_allow_html=True)
# Seed st.session_state with a default for every key the app reads, leaving
# any value that is already present untouched.
_SESSION_DEFAULTS = {
    'extracted_text': "",
    'entities': {},
    'docx_buffer': None,
    'pdf_bytes': None,
    'filename': "",
    'show_entities': False,
    'pdf_preview_buffer': None,
}
for _state_key, _default_value in _SESSION_DEFAULTS.items():
    if _state_key not in st.session_state:
        st.session_state[_state_key] = _default_value
def extract_text_from_pdf(pdf_bytes: bytes) -> str:
    """Return the text of every page of a PDF, concatenated in page order.

    Args:
        pdf_bytes: Raw content of the PDF, opened in memory with PyMuPDF.

    Returns:
        The page texts joined without a separator. If opening or reading the
        document raises, the error is reported through ``st.error`` and the
        text gathered up to that point (possibly ``""``) is returned.
    """
    page_texts: List[str] = []
    try:
        with fitz.open(stream=pdf_bytes, filetype="pdf") as pdf:
            for pdf_page in pdf:
                page_texts.append(pdf_page.get_text())
    except Exception as exc:
        # Best effort: surface the problem in the UI and fall through with
        # whatever was collected before the failure.
        st.error(f"Error extracting text from PDF: {str(exc)}")
    return "".join(page_texts)
def extract_entities(text: str) -> Dict[str, List[str]]:
    """Extract entities from text using regex patterns.

    Args:
        text: Plain text to scan.

    Returns:
        A dict with the keys "Dates", "Email Addresses", "Phone Numbers",
        "URLs", "Monetary Values", "Names" and "Organizations" (in that
        order). Each value is the sorted list of distinct matches for that
        category; a category without matches maps to an empty list.
    """
    # Each rule: (category, regex flags, patterns). None of the patterns has a
    # capturing group, so re.findall yields the full matched strings.
    rules = (
        ("Dates", re.IGNORECASE, (
            r'\b\d{1,2}[/-]\d{1,2}[/-]\d{2,4}\b',
            r'\b(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]* \d{1,2},? \d{4}\b',
            r'\b\d{4}[/-]\d{1,2}[/-]\d{1,2}\b',
        )),
        ("Email Addresses", re.IGNORECASE, (
            # The TLD class is letters only; the previous `[A-Z|a-z]` also
            # accepted a literal "|" and glued "a@b.com|c@d.org" together.
            r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b',
        )),
        ("Phone Numbers", 0, (
            r'\b\d{3}[-.]?\d{3}[-.]?\d{4}\b',
            # No leading \b here: "(" is not a word character, so `\b\(` only
            # matched when a letter/digit directly preceded the parenthesis
            # and missed the usual " (555) 123-4567" form.
            r'\(\d{3}\)\s*\d{3}[-.]?\d{4}\b',
        )),
        ("URLs", re.IGNORECASE, (
            r'https?://(?:[-\w.]|(?:%[\da-fA-F]{2}))+[/\w\.-]*',
        )),
        ("Monetary Values", re.IGNORECASE, (
            # Either comma-grouped thousands or a plain digit run, so that
            # "$1500" is captured whole instead of being cut to "$150".
            r'\$(?:\d{1,3}(?:,\d{3})+|\d+)(?:\.\d{2})?|\b\d+\s*(?:dollars|USD)\b',
        )),
        # Name pattern (simplified): an honorific followed by two capitalised words.
        ("Names", 0, (
            r'\b(?:Mr\.|Ms\.|Mrs\.|Dr\.)\s+[A-Z][a-z]+\s+[A-Z][a-z]+\b',
        )),
        # NOTE(review): left unchanged. The trailing \b after "Inc\." / "Ltd\."
        # etc. only matches when a word character follows the dot, and
        # IGNORECASE lets `[A-Z][a-z]+` match lower-case words too — confirm
        # whether that is the intended behaviour.
        ("Organizations", re.IGNORECASE, (
            r'\b(?:Inc\.|LLC|Ltd\.|Corp\.|Company|Co\.|Corporation|Incorporated)\b',
            r'\b[A-Z][a-z]+\s+(?:Inc|LLC|Ltd|Corp|Co)\b',
        )),
    )

    entities: Dict[str, List[str]] = {}
    for category, flags, patterns in rules:
        found = set()
        for pattern in patterns:
            found.update(re.findall(pattern, text, flags))
        # Remove duplicates and sort each entity list.
        entities[category] = sorted(found)
    return entities
def create_word_document(entities: Dict[str, List[str]], filename: str) -> BytesIO:
    """Create a Word document from extracted entities.

    The report contains a centred title, the source name and generation time,
    a summary table with one row per non-empty entity type plus a grand
    total, and a bulleted list of the values for each non-empty type.

    Args:
        entities: Mapping of entity type to the values found for it.
        filename: Name of the source document, shown in the report header.

    Returns:
        A ``BytesIO`` holding the .docx file, positioned at offset 0.
    """
    # Local import keeps this block self-contained; python-docx is already a
    # dependency of the file through ``import docx``.
    from docx.enum.text import WD_ALIGN_PARAGRAPH

    doc = docx.Document()

    # Add title
    title = doc.add_heading('Extracted Entities Report', 0)
    title.alignment = WD_ALIGN_PARAGRAPH.CENTER

    # Add metadata
    doc.add_paragraph(f"Source Document: {filename}")
    doc.add_paragraph(f"Extraction Date: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
    doc.add_paragraph("")

    # Add summary
    doc.add_heading('Summary', level=1)
    summary_table = doc.add_table(rows=1, cols=2)
    # Styles are referenced by name; looking them up by style id
    # ('LightShading-Accent1', 'ListBullet') is deprecated in python-docx.
    summary_table.style = 'Light Shading Accent 1'
    hdr_cells = summary_table.rows[0].cells
    hdr_cells[0].text = 'Entity Type'
    hdr_cells[1].text = 'Count'

    # Only entity types that actually produced values are reported.
    populated = [(entity_type, values) for entity_type, values in entities.items() if values]
    for entity_type, values in populated:
        row_cells = summary_table.add_row().cells
        row_cells[0].text = entity_type
        row_cells[1].text = str(len(values))
    total_entities = sum(len(values) for _, values in populated)
    doc.add_paragraph(f"\nTotal Entities Extracted: {total_entities}")
    doc.add_paragraph("")

    # Add detailed entities
    doc.add_heading('Detailed Entities', level=1)
    for entity_type, values in populated:
        doc.add_heading(entity_type, level=2)
        for value in values:
            # The list style supplies the bullet glyph, so no literal "•"
            # prefix is added (it would show up as a second bullet).
            doc.add_paragraph(value, style='List Bullet')
        doc.add_paragraph()

    # Save to bytes buffer
    buffer = BytesIO()
    doc.save(buffer)
    buffer.seek(0)
    return buffer
def _page_with_room(doc, page, y, limit, margin):
    """Return ``(page, y)``, starting a new page at the top margin once ``y`` exceeds ``limit``."""
    if y > limit:
        return doc.new_page(), margin
    return page, y


def create_pdf_preview(entities: Dict[str, List[str]], filename: str) -> BytesIO:
    """Create a simple PDF preview using PyMuPDF.

    The preview shows a title, the source name and generation time, a summary
    table (one striped row per non-empty entity type plus a total row) and a
    "Detailed Entities" section listing every value, continuing on new pages
    as the vertical cursor approaches the bottom limit.

    Args:
        entities: Mapping of entity type to the values found for it.
        filename: Name of the source document, shown under the title.

    Returns:
        A ``BytesIO`` holding the generated PDF, positioned at offset 0.
    """
    buffer = BytesIO()

    # Layout constants (PDF points).
    margin = 50
    x = margin
    line_height = 14
    page_height = 800
    title_color = (0.12, 0.23, 0.54)  # #1E3A8A
    section_color = (0.22, 0.26, 0.32)  # #374151
    stripe_color = (0.95, 0.96, 0.97)  # Light gray

    # The context manager closes the document even if drawing or saving raises.
    with fitz.open() as doc:
        page = doc.new_page()
        y = margin

        # Title
        page.insert_text((x, y), "Extracted Entities Preview", fontsize=16, color=title_color)
        y += 40

        # Metadata
        page.insert_text((x, y), f"Source: {filename}", fontsize=10)
        y += line_height * 1.5
        page.insert_text((x, y), f"Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}", fontsize=10)
        y += 30

        # Summary section
        page.insert_text((x, y), "Summary", fontsize=12, color=section_color)
        y += 25

        # Summary table header
        page.draw_rect((x, y, x + 300, y + 25), fill=(0.23, 0.51, 0.96))  # #3B82F6
        page.insert_text((x + 10, y + 18), "Entity Type", fontsize=10, color=(1, 1, 1))
        page.insert_text((x + 210, y + 18), "Count", fontsize=10, color=(1, 1, 1))
        y += 25

        # One row per entity type that has values, with alternating fill.
        populated = [(entity_type, values) for entity_type, values in entities.items() if values]
        total_entities = 0
        for row_index, (entity_type, values) in enumerate(populated):
            fill_color = stripe_color if row_index % 2 == 0 else (1, 1, 1)
            page.draw_rect((x, y, x + 300, y + 20), fill=fill_color)
            page.insert_text((x + 10, y + 13), entity_type, fontsize=10)
            page.insert_text((x + 210, y + 13), str(len(values)), fontsize=10)
            y += 20
            total_entities += len(values)

        # Total row
        y += 5
        page.draw_rect((x, y, x + 300, y + 25), fill=stripe_color)
        page.insert_text((x + 10, y + 18), "Total", fontsize=10, fontname="helv", color=(0, 0, 0))
        page.insert_text((x + 210, y + 18), str(total_entities), fontsize=10, fontname="helv", color=(0, 0, 0))
        y += 40

        # Detailed entities section
        page, y = _page_with_room(doc, page, y, page_height - 100, margin)
        page.insert_text((x, y), "Detailed Entities", fontsize=12, color=section_color)
        y += 30

        for entity_type, values in populated:
            page, y = _page_with_room(doc, page, y, page_height - 50, margin)
            page.insert_text((x, y), f"{entity_type}:", fontsize=11, color=title_color)
            y += 20
            for value in values:
                page, y = _page_with_room(doc, page, y, page_height - 30, margin)
                page.insert_text((x + 20, y), f"• {value}", fontsize=10)
                y += line_height
            y += 10

        # Save to buffer while the document is still open.
        doc.save(buffer)

    buffer.seek(0)
    return buffer
def display_pdf_viewer(pdf_bytes: bytes, filename: str, preview: bool = False):
    """Display PDF in the viewer.

    Renders the first page of the document as a PNG image and shows the file
    name and page count beneath it.

    Args:
        pdf_bytes: PDF content handed to ``fitz.open(stream=...)``.
        filename: Name shown in the caption when ``preview`` is false.
        preview: When true, the caption shows "Entities Preview.pdf" instead
            of ``filename``.

    Any exception (for example a document without pages) is reported through
    ``st.error`` rather than raised.
    """
    try:
        # The context manager closes the document even when rendering fails;
        # everything needed afterwards is copied out before it closes.
        with fitz.open(stream=pdf_bytes, filetype="pdf") as pdf_document:
            page_count = len(pdf_document)
            # Display first page as preview
            pix = pdf_document[0].get_pixmap(dpi=150)
            img_bytes = pix.tobytes("png")

        # Display the PDF page as image
        st.image(img_bytes, use_container_width=True)

        # Show document info
        col1, col2 = st.columns(2)
        with col1:
            display_name = "Entities Preview.pdf" if preview else filename
            st.caption(f"**File:** {display_name}")
        with col2:
            st.caption(f"**Pages:** {page_count}")
    except Exception as e:
        st.error(f"Error displaying PDF: {str(e)}")
def fetch_pdf_from_url(url: str) -> Tuple[bool, bytes]:
    """Fetch PDF from URL.

    Args:
        url: Address to download with a GET request (10 second timeout).

    Returns:
        ``(True, content)`` when the response looks like a PDF, otherwise
        ``(False, b"")``. A response is accepted when its Content-Type header
        mentions "pdf", when the URL ends in ".pdf", or when the body starts
        with the ``%PDF-`` signature. Request failures and HTTP error statuses
        are reported through ``st.error``; a non-PDF response through
        ``st.warning``.
    """
    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        }
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()

        # Check if it's a PDF
        content_type = response.headers.get('content-type', '').lower()
        body = response.content
        # Sniffing the signature accepts PDFs served with a generic content
        # type from URLs that do not end in ".pdf" (query strings, download
        # endpoints). Only a small prefix is inspected.
        has_pdf_signature = body[:1024].lstrip().startswith(b'%PDF-')
        if 'pdf' in content_type or url.lower().endswith('.pdf') or has_pdf_signature:
            return True, body

        st.warning("The URL does not appear to point to a PDF file.")
        return False, b""
    except requests.exceptions.RequestException as e:
        st.error(f"Error fetching PDF: {str(e)}")
        return False, b""
def clear_session():
    """Reset every session-state entry owned by the app to its initial value."""
    blank_state = {
        "extracted_text": "",
        "entities": {},
        "docx_buffer": None,
        "pdf_bytes": None,
        "filename": "",
        "show_entities": False,
        "pdf_preview_buffer": None,
    }
    for state_key, blank_value in blank_state.items():
        st.session_state[state_key] = blank_value
# Main application
def main():
    """Render the app: PDF input, document viewer and entity-extraction panel.

    Flow of one script run:
      1. Draw the header, the URL row (input, "Fetch", "Clear All") and the
         file uploader.
      2. Load a document into ``st.session_state`` from the URL (when "Fetch"
         was pressed) or from the uploader.
      3. If a document is loaded, show it on the left and the extraction
         controls/results on the right; otherwise show the welcome text.
    """
    # Local import so this block does not depend on the file-level imports.
    from urllib.parse import urlparse

    # Header
    st.markdown('<div class="main-header">PDF Entity Extractor</div>', unsafe_allow_html=True)

    # Top section for URL input with aligned buttons
    st.markdown("### Fetch PDF from URL")
    url_col1, url_col2, url_col3 = st.columns([4, 1, 1])
    with url_col1:
        url_input = st.text_input(
            "Enter PDF URL",
            placeholder="https://example.com/document.pdf",
            label_visibility="collapsed",
            key="url_input"
        )
    with url_col2:
        fetch_btn = st.button("Fetch", use_container_width=True, type="primary")
    with url_col3:
        if st.button("Clear All", use_container_width=True, type="secondary"):
            clear_session()
            st.rerun()

    # File upload section
    st.markdown("---")
    uploaded_file = st.file_uploader(
        "Upload PDF File",
        type=["pdf"],
        help="Select a PDF file from your device",
        label_visibility="collapsed"
    )

    # Handle URL fetch
    if fetch_btn and url_input:
        with st.spinner("Fetching PDF document..."):
            success, pdf_bytes = fetch_pdf_from_url(url_input)
        if success:
            st.session_state.pdf_bytes = pdf_bytes
            # Take the name from the URL path only, so a query string or
            # fragment does not end up in the download file name.
            url_name = urlparse(url_input).path.rsplit("/", 1)[-1]
            st.session_state.filename = url_name or "document.pdf"
            st.success("PDF document fetched successfully")
            st.session_state.show_entities = False
    # Handle file upload
    elif uploaded_file is not None:
        uploaded_bytes = uploaded_file.getvalue()
        # The uploader keeps returning the same file on every rerun. Reset the
        # results only when it differs from the loaded document; otherwise any
        # later interaction (e.g. a download click) would hide the results.
        is_new_document = (
            uploaded_file.name != st.session_state.filename
            or uploaded_bytes != st.session_state.pdf_bytes
        )
        if is_new_document:
            st.session_state.pdf_bytes = uploaded_bytes
            st.session_state.filename = uploaded_file.name
            st.session_state.show_entities = False

    # Process PDF if available
    if st.session_state.pdf_bytes:
        col1, col2 = st.columns([1, 1], gap="large")

        with col1:
            st.markdown('<div class="section-header">PDF Document</div>', unsafe_allow_html=True)
            display_pdf_viewer(st.session_state.pdf_bytes, st.session_state.filename)
            # Download button for PDF
            st.download_button(
                label="Download PDF",
                data=st.session_state.pdf_bytes,
                file_name=st.session_state.filename,
                mime="application/pdf",
                use_container_width=True,
                key="pdf_download"
            )

        with col2:
            st.markdown('<div class="section-header">Entity Extraction</div>', unsafe_allow_html=True)

            # Extract entities button
            if st.button("Extract Entities", use_container_width=True, type="primary"):
                with st.spinner("Processing document..."):
                    text = extract_text_from_pdf(st.session_state.pdf_bytes)
                    if text.strip():
                        entities = extract_entities(text)
                        st.session_state.extracted_text = text
                        st.session_state.entities = entities
                        # Create Word document
                        st.session_state.docx_buffer = create_word_document(
                            entities, st.session_state.filename
                        )
                        # Create PDF preview
                        st.session_state.pdf_preview_buffer = create_pdf_preview(
                            entities, st.session_state.filename
                        )
                        st.session_state.show_entities = True
                        st.success(f"Extracted {sum(len(v) for v in entities.values())} entities")
                    else:
                        st.error("Could not extract text from PDF. The document may be scanned or contain only images.")

            # Display entities if available
            if st.session_state.show_entities and st.session_state.entities:
                if any(st.session_state.entities.values()):
                    st.markdown("### Extracted Entities Preview")
                    # Display PDF preview
                    if st.session_state.pdf_preview_buffer:
                        # The viewer is annotated for bytes, so hand it the
                        # buffer's content rather than the BytesIO object.
                        display_pdf_viewer(
                            st.session_state.pdf_preview_buffer.getvalue(), "", preview=True
                        )
                    # Download Word document button
                    if st.session_state.docx_buffer:
                        st.markdown("### Download Report")
                        doc_name = f"{st.session_state.filename.rsplit('.', 1)[0]}_entities_report.docx"
                        st.download_button(
                            label="Download Word Report",
                            data=st.session_state.docx_buffer,
                            file_name=doc_name,
                            mime="application/vnd.openxmlformats-officedocument.wordprocessingml.document",
                            use_container_width=True,
                            key="docx_download"
                        )
                        st.caption("Note: The preview above is a PDF. The download will be a Word document containing the complete report.")
                else:
                    st.info("No entities found in the document.")
            else:
                st.info("Click 'Extract Entities' to begin entity extraction.")
    else:
        # Show instructions when no PDF is loaded.
        # NOTE(review): the opening and closing <div> are emitted by separate
        # st.markdown calls; presumably each call renders in its own element,
        # so the box may not actually wrap the text — confirm in the browser.
        st.markdown('<div class="info-box">', unsafe_allow_html=True)
        # Both inputs are rendered before this text, hence "above" for the
        # upload area as well.
        st.markdown(
            "\n"
            "### Welcome to PDF Entity Extractor\n"
            "**To get started, you can either:**\n"
            "1. **Enter a URL** - Provide a direct link to a PDF file above\n"
            "2. **Upload a PDF** - Use the upload area above to select a file from your device\n"
            "**What this tool does:**\n"
            "- Extracts entities like dates, email addresses, phone numbers, URLs, monetary values, names, and organizations\n"
            "- Generates a comprehensive Word document report\n"
            "- Provides a PDF preview of extracted entities\n"
            "**Supported formats:** PDF files with extractable text\n"
        )
        st.markdown('</div>', unsafe_allow_html=True)
if __name__ == "__main__":
main()