# streamlit run extraction_ui.py
"""Streamlit app that extracts simple entities from a PDF with regular expressions.

A PDF is loaded either from a URL or from a file upload.  Its text is pulled
out with PyMuPDF, scanned with a handful of regex patterns (dates, e-mail
addresses, phone numbers, URLs, monetary values, titled names, organization
suffixes) and the matches are rendered as a one-page-plus PDF preview and as a
downloadable Word report.
"""
import streamlit as st
import requests
import fitz  # PyMuPDF
import docx
from io import BytesIO
import re
from datetime import datetime
from typing import Dict, List, Tuple

# Set page configuration
st.set_page_config(
    page_title="PDF Entity Extractor",
    page_icon="📄",
    layout="wide",
    initial_sidebar_state="collapsed"
)

# Custom CSS for professional styling
# NOTE(review): the stylesheet markup appears to have been stripped from this
# copy of the file; only whitespace is left. Restore the original CSS if it is
# available.
st.markdown(""" """, unsafe_allow_html=True)

# Text shown when no PDF has been loaded yet.  Kept at column 0 so the
# markdown is not accidentally indented into a code block.
WELCOME_MARKDOWN = """
### Welcome to PDF Entity Extractor

**To get started, you can either:**

1. **Enter a URL** - Provide a direct link to a PDF file above
2. **Upload a PDF** - Use the upload area below to select a file from your device

**What this tool does:**

- Extracts entities like dates, email addresses, phone numbers, URLs, monetary values, names, and organizations
- Generates a comprehensive Word document report
- Provides a PDF preview of extracted entities

**Supported formats:** PDF files with extractable text
"""


def _default_session_values() -> Dict[str, object]:
    """Return a fresh mapping of session-state keys to their initial values.

    A new dict is built on every call so the mutable ``entities`` default is
    never shared between initialisation and later resets.
    """
    return {
        "extracted_text": "",
        "entities": {},
        "docx_buffer": None,
        "pdf_bytes": None,
        "filename": "",
        "show_entities": False,
        "pdf_preview_buffer": None,
    }


# Initialize session state
for _key, _value in _default_session_values().items():
    if _key not in st.session_state:
        st.session_state[_key] = _value

# Identity (name, size) of the upload that was last applied.  Streamlit
# re-executes the whole script on every interaction, so this is what lets
# main() tell "the user picked a new file" apart from "the same file is still
# sitting in the uploader".
if "last_upload_key" not in st.session_state:
    st.session_state.last_upload_key = None


def extract_text_from_pdf(pdf_bytes: bytes) -> str:
    """Extract the text of every page of a PDF.

    Args:
        pdf_bytes: Raw contents of the PDF file.

    Returns:
        The concatenated text of all pages.  If PyMuPDF raises while reading,
        the error is reported through ``st.error`` and whatever text was
        collected before the failure is returned (possibly an empty string).
    """
    page_texts: List[str] = []
    try:
        with fitz.open(stream=pdf_bytes, filetype="pdf") as doc:
            for page in doc:
                page_texts.append(page.get_text())
    except Exception as e:  # UI boundary: report any parser failure to the user
        st.error(f"Error extracting text from PDF: {str(e)}")
    return "".join(page_texts)


def extract_entities(text: str) -> Dict[str, List[str]]:
    """Extract entities from text using regex patterns.

    Args:
        text: Plain text to scan.

    Returns:
        A dict with the keys "Dates", "Email Addresses", "Phone Numbers",
        "URLs", "Monetary Values", "Names" and "Organizations".  Each value is
        a sorted list of the distinct matched strings (empty if none matched).
    """
    # Date patterns
    date_patterns = [
        r'\b\d{1,2}[/-]\d{1,2}[/-]\d{2,4}\b',
        r'\b(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]* \d{1,2},? \d{4}\b',
        r'\b\d{4}[/-]\d{1,2}[/-]\d{1,2}\b',
    ]

    # Email pattern.  The TLD class used to be "[A-Z|a-z]", which also
    # accepted a literal "|" character.
    email_pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b'

    # Phone number patterns.  The second pattern is for "(555) 123-4567"; it
    # previously used "$" anchors in place of the escaped parentheses and so
    # could never match.  There is no leading \b because "(" is not a word
    # character.
    phone_patterns = [
        r'\b\d{3}[-.]?\d{3}[-.]?\d{4}\b',
        r'\(\d{3}\)\s*\d{3}[-.]?\d{4}\b',
    ]

    # URL pattern
    url_pattern = r'https?://(?:[-\w.]|(?:%[\da-fA-F]{2}))+[/\w\.-]*'

    # Monetary values pattern
    money_pattern = r'\$\d{1,3}(?:,\d{3})*(?:\.\d{2})?|\b\d+\s*(?:dollars|USD)\b'

    # Name pattern (simplified): an honorific followed by two capitalised words
    name_pattern = r'\b(?:Mr\.|Ms\.|Mrs\.|Dr\.)\s+[A-Z][a-z]+\s+[A-Z][a-z]+\b'

    # Organization patterns
    org_patterns = [
        r'\b(?:Inc\.|LLC|Ltd\.|Corp\.|Company|Co\.|Corporation|Incorporated)\b',
        r'\b[A-Z][a-z]+\s+(?:Inc|LLC|Ltd|Corp|Co)\b',
    ]

    # (entity label, patterns, regex flags) - names and phone numbers are
    # matched case-sensitively, everything else ignores case.
    extraction_plan = [
        ("Dates", date_patterns, re.IGNORECASE),
        ("Email Addresses", [email_pattern], re.IGNORECASE),
        ("Phone Numbers", phone_patterns, 0),
        ("URLs", [url_pattern], re.IGNORECASE),
        ("Monetary Values", [money_pattern], re.IGNORECASE),
        ("Names", [name_pattern], 0),
        ("Organizations", org_patterns, re.IGNORECASE),
    ]

    entities: Dict[str, List[str]] = {}
    for label, patterns, flags in extraction_plan:
        matches = set()
        for pattern in patterns:
            matches.update(re.findall(pattern, text, flags))
        # Remove duplicates and sort each entity list
        entities[label] = sorted(matches)

    return entities


def create_word_document(entities: Dict[str, List[str]], filename: str) -> BytesIO:
    """Create a Word document from extracted entities.

    Args:
        entities: Mapping of entity type to the list of extracted values.
        filename: Name of the source document, printed in the report header.

    Returns:
        An in-memory buffer holding the .docx file, positioned at offset 0.
    """
    # Only entity types that actually produced matches appear in the report.
    populated = [(entity_type, values) for entity_type, values in entities.items() if values]

    doc = docx.Document()

    # Add title
    title = doc.add_heading('Extracted Entities Report', 0)
    title.alignment = 1  # Center alignment

    # Add metadata
    doc.add_paragraph(f"Source Document: {filename}")
    doc.add_paragraph(f"Extraction Date: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
    doc.add_paragraph("")

    # Add summary
    doc.add_heading('Summary', level=1)
    summary_table = doc.add_table(rows=1, cols=2)
    summary_table.style = 'LightShading-Accent1'
    hdr_cells = summary_table.rows[0].cells
    hdr_cells[0].text = 'Entity Type'
    hdr_cells[1].text = 'Count'

    for entity_type, values in populated:
        row_cells = summary_table.add_row().cells
        row_cells[0].text = entity_type
        row_cells[1].text = str(len(values))
    total_entities = sum(len(values) for _, values in populated)

    doc.add_paragraph(f"\nTotal Entities Extracted: {total_entities}")
    doc.add_paragraph("")

    # Add detailed entities
    doc.add_heading('Detailed Entities', level=1)
    for entity_type, values in populated:
        doc.add_heading(entity_type, level=2)
        for value in values:
            # The list style already renders a bullet; prefixing a literal
            # "•" as well produced two bullets per line.
            doc.add_paragraph(value, style='ListBullet')
        doc.add_paragraph()

    # Save to bytes buffer
    buffer = BytesIO()
    doc.save(buffer)
    buffer.seek(0)
    return buffer


def create_pdf_preview(entities: Dict[str, List[str]], filename: str) -> BytesIO:
    """Create a simple PDF preview using PyMuPDF.

    The preview has a title, source/timestamp lines, a summary table of counts
    per entity type and then the values of every non-empty entity type, with a
    new page started whenever the cursor gets close to ``page_height``.

    Args:
        entities: Mapping of entity type to the list of extracted values.
        filename: Name of the source document, printed under the title.

    Returns:
        An in-memory buffer holding the PDF, positioned at offset 0.
    """
    # Define margins and starting position
    margin = 50
    x = margin
    line_height = 14
    page_height = 800

    populated = [(entity_type, values) for entity_type, values in entities.items() if values]
    buffer = BytesIO()

    # Create a new PDF document
    doc = fitz.open()
    try:
        page = doc.new_page()
        y = margin

        # Title
        page.insert_text((x, y), "Extracted Entities Preview", fontsize=16, color=(0.12, 0.23, 0.54))  # #1E3A8A
        y += 40

        # Metadata
        page.insert_text((x, y), f"Source: {filename}", fontsize=10)
        y += line_height * 1.5
        page.insert_text((x, y), f"Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}", fontsize=10)
        y += 30

        # Summary section
        page.insert_text((x, y), "Summary", fontsize=12, color=(0.22, 0.26, 0.32))  # #374151
        y += 25

        # Draw summary table header
        page.draw_rect((x, y, x + 300, y + 25), fill=(0.23, 0.51, 0.96))  # #3B82F6
        page.insert_text((x + 10, y + 18), "Entity Type", fontsize=10, color=(1, 1, 1))
        page.insert_text((x + 210, y + 18), "Count", fontsize=10, color=(1, 1, 1))
        y += 25

        # Add entity rows with alternating light-gray / white backgrounds
        for row_index, (entity_type, values) in enumerate(populated):
            fill_color = (0.95, 0.96, 0.97) if row_index % 2 == 0 else (1, 1, 1)
            page.draw_rect((x, y, x + 300, y + 20), fill=fill_color)
            page.insert_text((x + 10, y + 13), entity_type, fontsize=10)
            page.insert_text((x + 210, y + 13), str(len(values)), fontsize=10)
            y += 20
        total_entities = sum(len(values) for _, values in populated)

        # Add total row
        y += 5
        page.draw_rect((x, y, x + 300, y + 25), fill=(0.95, 0.96, 0.97))
        page.insert_text((x + 10, y + 18), "Total", fontsize=10, fontname="helv", color=(0, 0, 0))
        page.insert_text((x + 210, y + 18), str(total_entities), fontsize=10, fontname="helv", color=(0, 0, 0))
        y += 40

        # Detailed entities section
        if y > page_height - 100:
            page = doc.new_page()
            y = margin
        page.insert_text((x, y), "Detailed Entities", fontsize=12, color=(0.22, 0.26, 0.32))
        y += 30

        # Add each entity type with values
        for entity_type, values in populated:
            if y > page_height - 50:
                page = doc.new_page()
                y = margin
            page.insert_text((x, y), f"{entity_type}:", fontsize=11, color=(0.12, 0.23, 0.54))
            y += 20

            for value in values:
                if y > page_height - 30:
                    page = doc.new_page()
                    y = margin
                page.insert_text((x + 20, y), f"• {value}", fontsize=10)
                y += line_height

            y += 10

        # Save to buffer
        doc.save(buffer)
    finally:
        # Release the PyMuPDF document even if drawing or saving failed.
        doc.close()

    buffer.seek(0)
    return buffer


def display_pdf_viewer(pdf_bytes: bytes, filename: str, preview: bool = False):
    """Display the first page of a PDF as an image, with a short caption.

    Args:
        pdf_bytes: Raw contents of the PDF to show.
        filename: Name shown in the caption when ``preview`` is false.
        preview: When true the caption shows "Entities Preview.pdf" instead of
            ``filename``.

    Any exception raised while opening or rendering the document is reported
    through ``st.error`` rather than propagated.
    """
    try:
        # The context manager closes the document even when rendering fails.
        with fitz.open(stream=pdf_bytes, filetype="pdf") as pdf_document:
            # Display first page as preview
            first_page = pdf_document[0]
            pix = first_page.get_pixmap(dpi=150)

            # Display the PDF page as a PNG image
            st.image(pix.tobytes("png"), use_container_width=True)

            # Show document info
            col1, col2 = st.columns(2)
            with col1:
                display_name = "Entities Preview.pdf" if preview else filename
                st.caption(f"**File:** {display_name}")
            with col2:
                st.caption(f"**Pages:** {len(pdf_document)}")
    except Exception as e:  # UI boundary: show the problem instead of crashing
        st.error(f"Error displaying PDF: {str(e)}")


def fetch_pdf_from_url(url: str) -> Tuple[bool, bytes]:
    """Fetch PDF from URL.

    Args:
        url: Address to download.

    Returns:
        ``(True, content)`` when the request succeeds and either the response
        content type mentions "pdf" or the URL ends in ".pdf"; otherwise
        ``(False, b"")`` after showing a warning (not a PDF) or an error
        (request failure) in the UI.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
    }
    try:
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        st.error(f"Error fetching PDF: {str(e)}")
        return False, b""

    # Check if it's a PDF
    content_type = response.headers.get('content-type', '').lower()
    if 'pdf' in content_type or url.lower().endswith('.pdf'):
        return True, response.content

    st.warning("The URL does not appear to point to a PDF file.")
    return False, b""


def clear_session():
    """Reset the document-related session state to its initial values.

    ``last_upload_key`` is deliberately left alone: the file uploader widget
    keeps its file after a rerun, and forgetting which upload was already
    applied would immediately load that file again.
    """
    for key, value in _default_session_values().items():
        st.session_state[key] = value


# Main application
def main():
    """Render the page: inputs, the loaded PDF and the extraction results."""
    # Header
    # NOTE(review): the HTML that wrapped this and the other header strings
    # below appears to have been stripped from this copy of the file; the
    # remaining text is kept exactly as found. Restore the markup if available.
    st.markdown('\n\nPDF Entity Extractor\n\n', unsafe_allow_html=True)

    # Top section for URL input with aligned buttons
    st.markdown("### Fetch PDF from URL")

    # Create columns for aligned layout
    url_col1, url_col2, url_col3 = st.columns([4, 1, 1])

    with url_col1:
        url_input = st.text_input(
            "Enter PDF URL",
            placeholder="https://example.com/document.pdf",
            label_visibility="collapsed",
            key="url_input"
        )

    with url_col2:
        fetch_btn = st.button("Fetch", use_container_width=True, type="primary")

    with url_col3:
        if st.button("Clear All", use_container_width=True, type="secondary"):
            clear_session()
            st.rerun()

    # File upload section
    st.markdown("---")
    uploaded_file = st.file_uploader(
        "Upload PDF File",
        type=["pdf"],
        help="Select a PDF file from your device",
        label_visibility="collapsed"
    )

    upload_key = None
    if uploaded_file is not None:
        upload_key = (uploaded_file.name, uploaded_file.size)

    # Handle URL fetch
    if fetch_btn and url_input:
        with st.spinner("Fetching PDF document..."):
            success, pdf_bytes = fetch_pdf_from_url(url_input)
            if success:
                st.session_state.pdf_bytes = pdf_bytes
                st.session_state.filename = url_input.split("/")[-1] or "document.pdf"
                st.success("PDF document fetched successfully")
                st.session_state.show_entities = False

    # Handle file upload.  The uploader keeps returning the same file on every
    # rerun, so it is applied only when it changes; otherwise each click would
    # reset the results, overwrite a fetched PDF or undo "Clear All".
    elif upload_key is not None and upload_key != st.session_state.last_upload_key:
        st.session_state.pdf_bytes = uploaded_file.getvalue()
        st.session_state.filename = uploaded_file.name
        st.session_state.show_entities = False

    # Remember what is in the uploader now (None once the user removes the
    # file, so picking the same file again counts as a new upload).
    st.session_state.last_upload_key = upload_key

    # Process PDF if available
    if st.session_state.pdf_bytes:
        # Create two columns
        col1, col2 = st.columns([1, 1], gap="large")

        with col1:
            st.markdown('\n\nPDF Document\n\n', unsafe_allow_html=True)
            display_pdf_viewer(st.session_state.pdf_bytes, st.session_state.filename)

            # Download button for PDF
            st.download_button(
                label="Download PDF",
                data=st.session_state.pdf_bytes,
                file_name=st.session_state.filename,
                mime="application/pdf",
                use_container_width=True,
                key="pdf_download"
            )

        with col2:
            st.markdown('\n\nEntity Extraction\n\n', unsafe_allow_html=True)

            # Extract entities button
            if st.button("Extract Entities", use_container_width=True, type="primary"):
                with st.spinner("Processing document..."):
                    text = extract_text_from_pdf(st.session_state.pdf_bytes)

                    if text.strip():
                        entities = extract_entities(text)
                        st.session_state.extracted_text = text
                        st.session_state.entities = entities

                        # Create Word document
                        st.session_state.docx_buffer = create_word_document(
                            entities, st.session_state.filename
                        )

                        # Create PDF preview
                        st.session_state.pdf_preview_buffer = create_pdf_preview(
                            entities, st.session_state.filename
                        )

                        st.session_state.show_entities = True
                        st.success(f"Extracted {sum(len(v) for v in entities.values())} entities")
                    else:
                        st.error("Could not extract text from PDF. The document may be scanned or contain only images.")

            # Display entities if available
            if st.session_state.show_entities and st.session_state.entities:
                if any(st.session_state.entities.values()):
                    st.markdown("### Extracted Entities Preview")

                    # Display PDF preview
                    if st.session_state.pdf_preview_buffer:
                        display_pdf_viewer(
                            st.session_state.pdf_preview_buffer.getvalue(), "", preview=True
                        )

                    # Download Word document button
                    if st.session_state.docx_buffer:
                        st.markdown("### Download Report")
                        doc_name = f"{st.session_state.filename.rsplit('.', 1)[0]}_entities_report.docx"

                        st.download_button(
                            label="Download Word Report",
                            data=st.session_state.docx_buffer,
                            file_name=doc_name,
                            mime="application/vnd.openxmlformats-officedocument.wordprocessingml.document",
                            use_container_width=True,
                            key="docx_download"
                        )

                        st.caption("Note: The preview above is a PDF. The download will be a Word document containing the complete report.")
                else:
                    st.info("No entities found in the document.")
            else:
                st.info("Click 'Extract Entities' to begin entity extraction.")

    else:
        # Show instructions when no PDF is loaded
        st.markdown('\n\n', unsafe_allow_html=True)
        st.markdown(WELCOME_MARKDOWN)
        st.markdown('\n\n', unsafe_allow_html=True)


if __name__ == "__main__":
    main()