Spaces:
Sleeping
Sleeping
| # streamlit run extraction_ui.py | |
| import streamlit as st | |
| import requests | |
| import fitz # PyMuPDF | |
| import docx | |
| from io import BytesIO | |
| import re | |
| from datetime import datetime | |
| from typing import Dict, List, Tuple | |
# Page-level settings: browser-tab title and icon, wide layout, and a sidebar
# that starts collapsed.
st.set_page_config(
    layout="wide",
    initial_sidebar_state="collapsed",
    page_title="PDF Entity Extractor",
    page_icon="📄",
)
# Stylesheet injected once per run. It defines the header, card and box
# classes used by the st.markdown() calls in this file and restyles
# Streamlit's own button / download-button / expander elements.
_CUSTOM_CSS = """
<style>
.main-header {
font-size: 2.5rem;
color: #1E3A8A;
text-align: center;
margin-bottom: 1rem;
font-weight: 600;
}
.section-header {
font-size: 1.5rem;
color: #374151;
margin-bottom: 1rem;
font-weight: 600;
border-bottom: 2px solid #E5E7EB;
padding-bottom: 0.5rem;
}
.info-box {
background-color: #F3F4F6;
padding: 1.5rem;
border-radius: 8px;
border-left: 4px solid #3B82F6;
margin-bottom: 1.5rem;
}
.success-box {
background-color: #D1FAE5;
padding: 1rem;
border-radius: 6px;
border-left: 4px solid #10B981;
margin: 1rem 0;
}
.entity-card {
background-color: white;
border: 1px solid #E5E7EB;
border-radius: 8px;
padding: 1rem;
margin-bottom: 1rem;
box-shadow: 0 1px 3px rgba(0, 0, 0, 0.1);
}
.stButton button {
background-color: #2563EB;
color: white;
border: none;
padding: 0.5rem 1rem;
border-radius: 6px;
font-weight: 500;
transition: background-color 0.3s;
height: 44px;
margin-top: 1.6rem;
}
.stButton button:hover {
background-color: #1D4ED8;
}
.clear-btn button {
background-color: #EF4444 !important;
height: 44px;
margin-top: 1.6rem;
}
.clear-btn button:hover {
background-color: #DC2626 !important;
}
.download-btn {
background-color: #10B981 !important;
}
.download-btn:hover {
background-color: #0DA271 !important;
}
.stDownloadButton button {
width: 100%;
}
.stExpander {
border: 1px solid #E5E7EB;
border-radius: 8px;
margin-bottom: 0.5rem;
}
.entity-item {
padding: 0.5rem 0;
border-bottom: 1px solid #F3F4F6;
}
.entity-item:last-child {
border-bottom: none;
}
.url-input-container {
margin-bottom: 1rem;
}
</style>
"""
# unsafe_allow_html is required so the <style> element is emitted as HTML
# instead of being escaped as text.
st.markdown(_CUSTOM_CSS, unsafe_allow_html=True)
# Seed st.session_state with a default for every key the app reads. Keys that
# already exist (set during an earlier run of the script) are left untouched.
_SESSION_DEFAULTS = {
    "extracted_text": "",
    "entities": {},
    "docx_buffer": None,
    "pdf_bytes": None,
    "filename": "",
    "show_entities": False,
    "pdf_preview_buffer": None,
}
for _state_key, _state_default in _SESSION_DEFAULTS.items():
    if _state_key not in st.session_state:
        st.session_state[_state_key] = _state_default
def extract_text_from_pdf(pdf_bytes: bytes) -> str:
    """Return the concatenated text of every page of a PDF.

    The PDF is opened from memory with PyMuPDF and its pages are read in
    document order. If opening or reading fails, the error is reported with
    ``st.error`` and whatever text was collected before the failure (possibly
    an empty string) is returned.
    """
    page_texts = []
    try:
        with fitz.open(stream=pdf_bytes, filetype="pdf") as pdf:
            for page in pdf:
                page_texts.append(page.get_text())
    except Exception as exc:
        st.error(f"Error extracting text from PDF: {str(exc)}")
    return "".join(page_texts)
def extract_entities(text: str) -> Dict[str, List[str]]:
    """Extract common entity types from free text using regular expressions.

    Args:
        text: The text to scan. An empty or ``None`` value yields empty lists.

    Returns:
        A dict with the keys "Dates", "Email Addresses", "Phone Numbers",
        "URLs", "Monetary Values", "Names" and "Organizations" (in that
        order). Each value is a sorted list of the distinct matches found.
    """
    text = text or ""

    # Numeric d/m/y-style dates, "Mon D, YYYY" dates, and ISO-like Y-M-D dates.
    date_patterns = [
        r'\b\d{1,2}[/-]\d{1,2}[/-]\d{2,4}\b',
        r'\b(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]* \d{1,2},? \d{4}\b',
        r'\b\d{4}[/-]\d{1,2}[/-]\d{1,2}\b',
    ]
    # The TLD class is letters only; the previous "[A-Z|a-z]" also accepted a
    # literal "|" and so swallowed text following the address.
    email_patterns = [r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b']
    phone_patterns = [
        r'\b\d{3}[-.]?\d{3}[-.]?\d{4}\b',
        # "(" is not a word character, so a leading \b could only match when
        # the parenthesis was glued to a preceding word. Use a lookbehind so
        # "(555) 123-4567" is found at line start or after whitespace.
        r'(?<!\w)\(\d{3}\)\s*\d{3}[-.]?\d{4}\b',
    ]
    url_patterns = [r'https?://(?:[-\w.]|(?:%[\da-fA-F]{2}))+[/\w\.-]*']
    # "\d+" rather than "\d{1,3}" so amounts written without thousands
    # separators ("$1500") are not truncated to their first three digits.
    money_patterns = [r'\$\d+(?:,\d{3})*(?:\.\d{2})?|\b\d+\s*(?:dollars|USD)\b']
    # Simplified: an honorific followed by two capitalised words.
    name_patterns = [r'\b(?:Mr\.|Ms\.|Mrs\.|Dr\.)\s+[A-Z][a-z]+\s+[A-Z][a-z]+\b']
    org_patterns = [
        # Several suffixes end in ".", after which \b only matches when a word
        # character follows (so "Inc." before a space was missed while "co."
        # inside "example.co.uk" matched). "(?!\w)" is equivalent to \b after
        # the letter-terminated suffixes and correct for the dotted ones.
        r'\b(?:Inc\.|LLC|Ltd\.|Corp\.|Company|Co\.|Corporation|Incorporated)(?!\w)',
        r'\b[A-Z][a-z]+\s+(?:Inc|LLC|Ltd|Corp|Co)\b',
    ]

    def _unique_matches(patterns: List[str], flags: int = 0) -> List[str]:
        """Return the sorted, de-duplicated matches of all patterns in text."""
        found = set()
        for pattern in patterns:
            found.update(re.findall(pattern, text, flags))
        return sorted(found)

    # Phone numbers and names are matched case-sensitively; everything else
    # ignores case.
    return {
        "Dates": _unique_matches(date_patterns, re.IGNORECASE),
        "Email Addresses": _unique_matches(email_patterns, re.IGNORECASE),
        "Phone Numbers": _unique_matches(phone_patterns),
        "URLs": _unique_matches(url_patterns, re.IGNORECASE),
        "Monetary Values": _unique_matches(money_patterns, re.IGNORECASE),
        "Names": _unique_matches(name_patterns),
        "Organizations": _unique_matches(org_patterns, re.IGNORECASE),
    }
def create_word_document(entities: Dict[str, List[str]], filename: str) -> BytesIO:
    """Build a Word (.docx) report of the extracted entities.

    The report contains a centred title, the source file name and generation
    time, a summary table with one row per non-empty entity type, the overall
    total, and a bulleted list of values for each non-empty entity type.

    Args:
        entities: Mapping of entity type to the list of extracted values.
        filename: Name of the source document, shown in the report header.

    Returns:
        A ``BytesIO`` positioned at offset 0 holding the .docx file.
    """
    # Imported locally so this function does not depend on edits elsewhere.
    from docx.enum.text import WD_ALIGN_PARAGRAPH

    doc = docx.Document()

    title = doc.add_heading('Extracted Entities Report', 0)
    title.alignment = WD_ALIGN_PARAGRAPH.CENTER

    # Metadata
    doc.add_paragraph(f"Source Document: {filename}")
    doc.add_paragraph(f"Extraction Date: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
    doc.add_paragraph("")

    # Summary table
    doc.add_heading('Summary', level=1)
    summary_table = doc.add_table(rows=1, cols=2)
    # Styles are addressed by their UI name; looking them up by style id
    # ("LightShading-Accent1") only works through a deprecated fallback.
    summary_table.style = 'Light Shading Accent 1'
    hdr_cells = summary_table.rows[0].cells
    hdr_cells[0].text = 'Entity Type'
    hdr_cells[1].text = 'Count'

    populated = [(entity_type, values) for entity_type, values in entities.items() if values]

    total_entities = 0
    for entity_type, values in populated:
        row_cells = summary_table.add_row().cells
        row_cells[0].text = entity_type
        row_cells[1].text = str(len(values))
        total_entities += len(values)
    doc.add_paragraph(f"\nTotal Entities Extracted: {total_entities}")
    doc.add_paragraph("")

    # Detailed listing
    doc.add_heading('Detailed Entities', level=1)
    for entity_type, values in populated:
        doc.add_heading(entity_type, level=2)
        for value in values:
            # The "List Bullet" style already renders a bullet glyph, so no
            # literal "•" is prepended (that produced a double bullet).
            doc.add_paragraph(str(value), style='List Bullet')
        doc.add_paragraph()

    buffer = BytesIO()
    doc.save(buffer)
    buffer.seek(0)  # rewind so callers can read from the start
    return buffer
def create_pdf_preview(entities: Dict[str, List[str]], filename: str) -> BytesIO:
    """Render a simple PDF summary of the extracted entities with PyMuPDF.

    The PDF contains a title, the source name and generation time, a striped
    summary table (one row per non-empty entity type plus a total row) and a
    detailed listing of every value, continuing onto new pages as needed.

    Args:
        entities: Mapping of entity type to the list of extracted values.
        filename: Name of the source document, shown under the title.

    Returns:
        A ``BytesIO`` positioned at offset 0 holding the PDF.
    """
    margin = 50
    line_height = 14
    # Bottom limit (in points) used for the page-break checks below; it sits
    # a little above the 842 pt height of fitz's default A4 page.
    page_height = 800

    title_blue = (0.12, 0.23, 0.54)    # #1E3A8A
    heading_gray = (0.22, 0.26, 0.32)  # #374151
    header_fill = (0.23, 0.51, 0.96)   # #3B82F6
    stripe_fill = (0.95, 0.96, 0.97)   # light gray
    white = (1, 1, 1)
    black = (0, 0, 0)

    populated = [(entity_type, values) for entity_type, values in entities.items() if values]

    buffer = BytesIO()
    # The context manager closes the document even if drawing or saving
    # raises, so the handle is never leaked.
    with fitz.open() as doc:
        page = doc.new_page()
        x = margin
        y = margin

        # Title
        page.insert_text((x, y), "Extracted Entities Preview", fontsize=16, color=title_blue)
        y += 40

        # Metadata
        page.insert_text((x, y), f"Source: {filename}", fontsize=10)
        y += line_height * 1.5
        page.insert_text(
            (x, y),
            f"Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}",
            fontsize=10,
        )
        y += 30

        # Summary heading and table header
        page.insert_text((x, y), "Summary", fontsize=12, color=heading_gray)
        y += 25
        page.draw_rect((x, y, x + 300, y + 25), fill=header_fill)
        page.insert_text((x + 10, y + 18), "Entity Type", fontsize=10, color=white)
        page.insert_text((x + 210, y + 18), "Count", fontsize=10, color=white)
        y += 25

        # One striped row per non-empty entity type
        total_entities = 0
        for row_index, (entity_type, values) in enumerate(populated):
            fill_color = stripe_fill if row_index % 2 == 0 else white
            page.draw_rect((x, y, x + 300, y + 20), fill=fill_color)
            page.insert_text((x + 10, y + 13), entity_type, fontsize=10)
            page.insert_text((x + 210, y + 13), str(len(values)), fontsize=10)
            y += 20
            total_entities += len(values)

        # Total row
        y += 5
        page.draw_rect((x, y, x + 300, y + 25), fill=stripe_fill)
        page.insert_text((x + 10, y + 18), "Total", fontsize=10, fontname="helv", color=black)
        page.insert_text(
            (x + 210, y + 18), str(total_entities), fontsize=10, fontname="helv", color=black
        )
        y += 40

        # Detailed section; start a new page when too close to the bottom
        if y > page_height - 100:
            page = doc.new_page()
            x = margin
            y = margin
        page.insert_text((x, y), "Detailed Entities", fontsize=12, color=heading_gray)
        y += 30

        for entity_type, values in populated:
            if y > page_height - 50:
                page = doc.new_page()
                x = margin
                y = margin
            page.insert_text((x, y), f"{entity_type}:", fontsize=11, color=title_blue)
            y += 20
            for value in values:
                if y > page_height - 30:
                    page = doc.new_page()
                    x = margin
                    y = margin
                page.insert_text((x + 20, y), f"• {value}", fontsize=10)
                y += line_height
            y += 10  # gap between entity groups

        doc.save(buffer)

    buffer.seek(0)  # rewind so callers can read from the start
    return buffer
def display_pdf_viewer(pdf_bytes: bytes, filename: str, preview: bool = False):
    """Show the first page of a PDF as an image, with a short caption row.

    Args:
        pdf_bytes: The PDF content to open as an in-memory stream.
        filename: Name shown in the caption when ``preview`` is False.
        preview: When True the caption shows "Entities Preview.pdf" instead
            of ``filename``.

    Any failure while opening, rendering or displaying is reported with
    ``st.error`` rather than raised.
    """
    try:
        # The context manager guarantees the document is closed even when
        # rendering fails (previously close() was skipped on any exception).
        with fitz.open(stream=pdf_bytes, filetype="pdf") as pdf_document:
            # Only the first page is rasterised for display.
            first_page = pdf_document[0]
            pix = first_page.get_pixmap(dpi=150)
            img_bytes = pix.tobytes("png")
            page_count = len(pdf_document)

        st.image(img_bytes, use_container_width=True)

        # Document info
        col1, col2 = st.columns(2)
        with col1:
            display_name = "Entities Preview.pdf" if preview else filename
            st.caption(f"**File:** {display_name}")
        with col2:
            st.caption(f"**Pages:** {page_count}")
    except Exception as e:
        st.error(f"Error displaying PDF: {str(e)}")
def fetch_pdf_from_url(url: str) -> Tuple[bool, bytes]:
    """Download a PDF from a URL.

    Args:
        url: Address of the PDF. Surrounding whitespace is ignored.

    Returns:
        ``(True, content)`` when the response looks like a PDF, otherwise
        ``(False, b"")``. Request failures (including HTTP error statuses and
        the 10-second timeout) are reported with ``st.error``; a response
        that does not look like a PDF is reported with ``st.warning``.
    """
    url = url.strip()  # tolerate whitespace from copy/paste
    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        }
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()
        content_type = response.headers.get('content-type', '').lower()
        payload = response.content
    except requests.exceptions.RequestException as e:
        st.error(f"Error fetching PDF: {str(e)}")
        return False, b""

    # Accept on the declared type or the URL suffix as before, and also on
    # the "%PDF-" signature near the start of the body, so PDFs served with a
    # generic content type from URLs with query strings are not rejected.
    looks_like_pdf = (
        'pdf' in content_type
        or url.lower().endswith('.pdf')
        or b'%PDF-' in payload[:1024]
    )
    if looks_like_pdf:
        return True, payload

    st.warning("The URL does not appear to point to a PDF file.")
    return False, b""
def clear_session():
    """Reset every session key this app manages to its empty default."""
    blank_state = {
        "extracted_text": "",
        "entities": {},
        "docx_buffer": None,
        "pdf_bytes": None,
        "filename": "",
        "show_entities": False,
        "pdf_preview_buffer": None,
    }
    for key, value in blank_state.items():
        st.session_state[key] = value
# Main application
def main():
    """Render the app: PDF input (URL or upload), viewer, and entity extraction.

    Streamlit re-runs this function on every widget interaction, so all state
    that must survive (the loaded PDF, extracted entities, generated reports)
    is kept in ``st.session_state``.
    """
    # Header
    st.markdown('<div class="main-header">PDF Entity Extractor</div>', unsafe_allow_html=True)

    # URL input row with aligned buttons
    st.markdown("### Fetch PDF from URL")
    url_col1, url_col2, url_col3 = st.columns([4, 1, 1])
    with url_col1:
        url_input = st.text_input(
            "Enter PDF URL",
            placeholder="https://example.com/document.pdf",
            label_visibility="collapsed",
            key="url_input"
        )
    with url_col2:
        fetch_btn = st.button("Fetch", use_container_width=True, type="primary")
    with url_col3:
        if st.button("Clear All", use_container_width=True, type="secondary"):
            clear_session()
            st.rerun()

    # File upload section
    st.markdown("---")
    uploaded_file = st.file_uploader(
        "Upload PDF File",
        type=["pdf"],
        help="Select a PDF file from your device",
        label_visibility="collapsed"
    )

    if fetch_btn and url_input:
        # Handle URL fetch
        with st.spinner("Fetching PDF document..."):
            success, pdf_bytes = fetch_pdf_from_url(url_input)
            if success:
                st.session_state.pdf_bytes = pdf_bytes
                st.session_state.filename = url_input.split("/")[-1] or "document.pdf"
                st.success("PDF document fetched successfully")
                st.session_state.show_entities = False
    elif uploaded_file is not None:
        # The uploader keeps returning the same file on every rerun. Only
        # ingest it when it is a *new* upload; otherwise each interaction
        # (e.g. clicking a download button) would hide the extracted
        # entities, overwrite a PDF fetched from a URL, and undo "Clear All".
        upload_signature = (
            uploaded_file.name,
            uploaded_file.size,
            getattr(uploaded_file, "file_id", None),
        )
        if st.session_state.get("upload_signature") != upload_signature:
            st.session_state["upload_signature"] = upload_signature
            st.session_state.pdf_bytes = uploaded_file.getvalue()
            st.session_state.filename = uploaded_file.name
            st.session_state.show_entities = False
    else:
        # Uploader is empty: forget the last upload so re-adding the same
        # file is treated as a new upload.
        st.session_state["upload_signature"] = None

    # Process PDF if available
    if st.session_state.pdf_bytes:
        col1, col2 = st.columns([1, 1], gap="large")

        with col1:
            st.markdown('<div class="section-header">PDF Document</div>', unsafe_allow_html=True)
            display_pdf_viewer(st.session_state.pdf_bytes, st.session_state.filename)
            # Download button for the source PDF
            st.download_button(
                label="Download PDF",
                data=st.session_state.pdf_bytes,
                file_name=st.session_state.filename,
                mime="application/pdf",
                use_container_width=True,
                key="pdf_download"
            )

        with col2:
            st.markdown('<div class="section-header">Entity Extraction</div>', unsafe_allow_html=True)

            if st.button("Extract Entities", use_container_width=True, type="primary"):
                with st.spinner("Processing document..."):
                    text = extract_text_from_pdf(st.session_state.pdf_bytes)
                    if text.strip():
                        entities = extract_entities(text)
                        st.session_state.extracted_text = text
                        st.session_state.entities = entities
                        # Build both report artefacts once and cache them in
                        # session state so later reruns can reuse them.
                        st.session_state.docx_buffer = create_word_document(
                            entities, st.session_state.filename
                        )
                        st.session_state.pdf_preview_buffer = create_pdf_preview(
                            entities, st.session_state.filename
                        )
                        st.session_state.show_entities = True
                        st.success(f"Extracted {sum(len(v) for v in entities.values())} entities")
                    else:
                        st.error("Could not extract text from PDF. The document may be scanned or contain only images.")

            # Display entities if available
            if st.session_state.show_entities and st.session_state.entities:
                if any(st.session_state.entities.values()):
                    st.markdown("### Extracted Entities Preview")
                    if st.session_state.pdf_preview_buffer:
                        # Pass raw bytes, matching display_pdf_viewer's signature.
                        display_pdf_viewer(
                            st.session_state.pdf_preview_buffer.getvalue(), "", preview=True
                        )
                    if st.session_state.docx_buffer:
                        st.markdown("### Download Report")
                        doc_name = f"{st.session_state.filename.rsplit('.', 1)[0]}_entities_report.docx"
                        st.download_button(
                            label="Download Word Report",
                            data=st.session_state.docx_buffer,
                            file_name=doc_name,
                            mime="application/vnd.openxmlformats-officedocument.wordprocessingml.document",
                            use_container_width=True,
                            key="docx_download"
                        )
                        st.caption("Note: The preview above is a PDF. The download will be a Word document containing the complete report.")
                else:
                    st.info("No entities found in the document.")
            else:
                st.info("Click 'Extract Entities' to begin entity extraction.")
    else:
        # Show instructions when no PDF is loaded
        st.markdown('<div class="info-box">', unsafe_allow_html=True)
        # Blank lines separate the heading, paragraphs and lists so Markdown
        # renders them as distinct blocks.
        st.markdown(
            "### Welcome to PDF Entity Extractor\n"
            "\n"
            "**To get started, you can either:**\n"
            "\n"
            "1. **Enter a URL** - Provide a direct link to a PDF file above\n"
            "2. **Upload a PDF** - Use the upload area below to select a file from your device\n"
            "\n"
            "**What this tool does:**\n"
            "\n"
            "- Extracts entities like dates, email addresses, phone numbers, URLs, monetary values, names, and organizations\n"
            "- Generates a comprehensive Word document report\n"
            "- Provides a PDF preview of extracted entities\n"
            "\n"
            "**Supported formats:** PDF files with extractable text\n"
        )
        st.markdown('</div>', unsafe_allow_html=True)
# Entry point: build the UI when this file is executed as a script (see the
# "streamlit run" note at the top of the file), not when it is imported.
if __name__ == "__main__":
    main()