Spaces:

sandtemp01
/

extract_selva_ui

Sleeping

File size: 19,156 Bytes

#  streamlit run extraction_ui.py
import re
from datetime import datetime
from io import BytesIO
from typing import Dict, List, Tuple
from urllib.parse import unquote, urlparse

import docx
import fitz  # PyMuPDF
import requests
import streamlit as st

# Page-level configuration: browser tab title/icon, wide layout, sidebar
# collapsed on load. Issued ahead of every other st.* call in this file.
st.set_page_config(
    page_title="PDF Entity Extractor",
    page_icon="📄",
    layout="wide",
    initial_sidebar_state="collapsed"
)

# Inject a global stylesheet as raw HTML (hence unsafe_allow_html=True).
# Of the custom classes declared here, only .main-header, .section-header and
# .info-box are referenced by markup in the code visible in this file; the
# remaining custom classes (.success-box, .entity-card, .clear-btn,
# .download-btn, .entity-item, .url-input-container) have no visible user.
# The .stButton / .stDownloadButton / .stExpander selectors target
# Streamlit's own widget wrappers.
st.markdown("""
<style>
    .main-header {
        font-size: 2.5rem;
        color: #1E3A8A;
        text-align: center;
        margin-bottom: 1rem;
        font-weight: 600;
    }
    .section-header {
        font-size: 1.5rem;
        color: #374151;
        margin-bottom: 1rem;
        font-weight: 600;
        border-bottom: 2px solid #E5E7EB;
        padding-bottom: 0.5rem;
    }
    .info-box {
        background-color: #F3F4F6;
        padding: 1.5rem;
        border-radius: 8px;
        border-left: 4px solid #3B82F6;
        margin-bottom: 1.5rem;
    }
    .success-box {
        background-color: #D1FAE5;
        padding: 1rem;
        border-radius: 6px;
        border-left: 4px solid #10B981;
        margin: 1rem 0;
    }
    .entity-card {
        background-color: white;
        border: 1px solid #E5E7EB;
        border-radius: 8px;
        padding: 1rem;
        margin-bottom: 1rem;
        box-shadow: 0 1px 3px rgba(0, 0, 0, 0.1);
    }
    .stButton button {
        background-color: #2563EB;
        color: white;
        border: none;
        padding: 0.5rem 1rem;
        border-radius: 6px;
        font-weight: 500;
        transition: background-color 0.3s;
        height: 44px;
        margin-top: 1.6rem;
    }
    .stButton button:hover {
        background-color: #1D4ED8;
    }
    .clear-btn button {
        background-color: #EF4444 !important;
        height: 44px;
        margin-top: 1.6rem;
    }
    .clear-btn button:hover {
        background-color: #DC2626 !important;
    }
    .download-btn {
        background-color: #10B981 !important;
    }
    .download-btn:hover {
        background-color: #0DA271 !important;
    }
    .stDownloadButton button {
        width: 100%;
    }
    .stExpander {
        border: 1px solid #E5E7EB;
        border-radius: 8px;
        margin-bottom: 0.5rem;
    }
    .entity-item {
        padding: 0.5rem 0;
        border-bottom: 1px solid #F3F4F6;
    }
    .entity-item:last-child {
        border-bottom: none;
    }
    .url-input-container {
        margin-bottom: 1rem;
    }
</style>
""", unsafe_allow_html=True)

# Seed every session key this app reads with its empty default. Keys that
# already hold a value (i.e. on reruns within the same session) are left alone.
_SESSION_DEFAULTS = {
    'extracted_text': "",
    'entities': {},
    'docx_buffer': None,
    'pdf_bytes': None,
    'filename': "",
    'show_entities': False,
    'pdf_preview_buffer': None,
}
for _state_key, _default_value in _SESSION_DEFAULTS.items():
    if _state_key not in st.session_state:
        st.session_state[_state_key] = _default_value

def extract_text_from_pdf(pdf_bytes: bytes) -> str:
    """Return the text of every page of a PDF, concatenated in page order.

    Args:
        pdf_bytes: Raw PDF content, opened in memory with PyMuPDF.

    Returns:
        The joined page text. If opening or reading fails, the error is
        reported through ``st.error`` and whatever text was gathered before
        the failure (possibly an empty string) is returned.
    """
    page_texts = []
    try:
        with fitz.open(stream=pdf_bytes, filetype="pdf") as pdf:
            for pdf_page in pdf:
                page_texts.append(pdf_page.get_text())
    except Exception as exc:
        st.error(f"Error extracting text from PDF: {str(exc)}")
    # Joining after the try block keeps the pages read before any failure.
    return "".join(page_texts)

def extract_entities(text: str) -> Dict[str, List[str]]:
    """Extract simple entities from text with regular expressions.

    Args:
        text: Plain text to scan.

    Returns:
        A dict with the keys "Dates", "Email Addresses", "Phone Numbers",
        "URLs", "Monetary Values", "Names" and "Organizations" (in that
        order). Each value is a de-duplicated, lexicographically sorted list
        of the matched substrings; categories with no match map to ``[]``.
    """
    # Each rule: (result key, regex patterns, re flags). Every group is
    # non-capturing so re.findall returns whole matches.
    rules = [
        (
            "Dates",
            [
                r'\b\d{1,2}[/-]\d{1,2}[/-]\d{2,4}\b',
                r'\b(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]* \d{1,2},? \d{4}\b',
                r'\b\d{4}[/-]\d{1,2}[/-]\d{1,2}\b',
            ],
            re.IGNORECASE,
        ),
        (
            "Email Addresses",
            # TLD class is letters only: "[A-Z|a-z]" would also accept a
            # literal "|" and swallow text such as "a@b.com|next".
            [r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b'],
            re.IGNORECASE,
        ),
        (
            "Phone Numbers",
            [
                r'\b\d{3}[-.]?\d{3}[-.]?\d{4}\b',
                # "(" is not a word character, so a leading \b only matches
                # when a word character precedes the parenthesis. Use a
                # negative lookbehind so "(555) 123-4567" is found after
                # whitespace or at the start of the text.
                r'(?<!\w)\(\d{3}\)\s*\d{3}[-.]?\d{4}\b',
            ],
            0,
        ),
        (
            "URLs",
            [r'https?://(?:[-\w.]|(?:%[\da-fA-F]{2}))+[/\w\.-]*'],
            re.IGNORECASE,
        ),
        (
            "Monetary Values",
            # \d+ (rather than \d{1,3}) keeps ungrouped amounts whole:
            # "$1234.56" is reported as-is instead of being cut to "$123".
            [r'\$\d+(?:,\d{3})*(?:\.\d{2})?|\b\d+\s*(?:dollars|USD)\b'],
            re.IGNORECASE,
        ),
        (
            "Names",
            # Simplified: honorific followed by two capitalised words.
            [r'\b(?:Mr\.|Ms\.|Mrs\.|Dr\.)\s+[A-Z][a-z]+\s+[A-Z][a-z]+\b'],
            0,
        ),
        (
            "Organizations",
            [
                # (?!\w) instead of a trailing \b: after a dotted suffix such
                # as "Inc." a \b would demand a following word character, so
                # "Inc. " never matched.
                r'\b(?:Inc\.|LLC|Ltd\.|Corp\.|Company|Co\.|Corporation|Incorporated)(?!\w)',
                r'\b[A-Z][a-z]+\s+(?:Inc|LLC|Ltd|Corp|Co)\b',
            ],
            re.IGNORECASE,
        ),
    ]

    entities: Dict[str, List[str]] = {}
    for label, patterns, flags in rules:
        matches = set()
        for pattern in patterns:
            matches.update(re.findall(pattern, text, flags))
        # De-duplicate via the set, then sort for stable output.
        entities[label] = sorted(matches)

    return entities

def create_word_document(entities: Dict[str, List[str]], filename: str) -> BytesIO:
    """Build a Word (.docx) report of the extracted entities.

    The report contains a centred title, the source filename and generation
    time, a summary table with one row per non-empty entity type, the overall
    total, and a bulleted list of values for each non-empty entity type.

    Args:
        entities: Mapping of entity type to the values found for it.
        filename: Name of the source document, shown in the report header.

    Returns:
        A ``BytesIO`` positioned at offset 0 holding the saved document.
    """
    report = docx.Document()

    # Add title
    title = report.add_heading('Extracted Entities Report', 0)
    title.alignment = 1  # Center alignment

    # Add metadata
    report.add_paragraph(f"Source Document: {filename}")
    report.add_paragraph(f"Extraction Date: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
    report.add_paragraph("")

    # Only entity types that actually produced values appear in the report.
    populated = [(entity_type, values) for entity_type, values in entities.items() if values]

    # Add summary
    report.add_heading('Summary', level=1)
    summary_table = report.add_table(rows=1, cols=2)
    # Styles are referenced by their UI name. The style-id spelling
    # ('LightShading-Accent1') only resolves through python-docx's deprecated
    # id fallback, which emits a UserWarning.
    summary_table.style = 'Light Shading Accent 1'
    hdr_cells = summary_table.rows[0].cells
    hdr_cells[0].text = 'Entity Type'
    hdr_cells[1].text = 'Count'

    total_entities = 0
    for entity_type, values in populated:
        row_cells = summary_table.add_row().cells
        row_cells[0].text = entity_type
        row_cells[1].text = str(len(values))
        total_entities += len(values)

    report.add_paragraph(f"\nTotal Entities Extracted: {total_entities}")
    report.add_paragraph("")

    # Add detailed entities
    report.add_heading('Detailed Entities', level=1)

    for entity_type, values in populated:
        report.add_heading(entity_type, level=2)
        for value in values:
            # The 'List Bullet' style already renders a bullet glyph, so the
            # text is the bare value (a literal "• " prefix would show two).
            report.add_paragraph(value, style='List Bullet')
        report.add_paragraph()

    # Save to bytes buffer and rewind so callers can read from the start.
    buffer = BytesIO()
    report.save(buffer)
    buffer.seek(0)
    return buffer

def create_pdf_preview(entities: Dict[str, List[str]], filename: str) -> BytesIO:
    """Render the extraction results as a simple PDF using PyMuPDF.

    Layout: a title, source/generation metadata, a summary table (one
    zebra-striped row per non-empty entity type plus a total row), then a
    "Detailed Entities" section listing every value. New pages are started
    when the vertical cursor nears the bottom of the page.

    Args:
        entities: Mapping of entity type to the values found for it.
        filename: Name of the source document, shown in the metadata line.

    Returns:
        A ``BytesIO`` positioned at offset 0 holding the generated PDF.
    """
    buffer = BytesIO()

    # Layout constants, in PDF points.
    margin = 50
    line_height = 14
    page_height = 800  # page-break thresholds below are measured from this

    # Only entity types that actually produced values are drawn.
    populated = [(entity_type, values) for entity_type, values in entities.items() if values]

    # The context manager closes the document even if drawing or saving raises.
    with fitz.open() as doc:
        page = doc.new_page()
        x = margin  # left edge; constant for the whole document
        y = margin  # vertical cursor; reset to the margin on every new page

        # Title
        page.insert_text((x, y), "Extracted Entities Preview", fontsize=16, color=(0.12, 0.23, 0.54))  # #1E3A8A
        y += 40

        # Metadata
        page.insert_text((x, y), f"Source: {filename}", fontsize=10)
        y += line_height * 1.5

        page.insert_text((x, y), f"Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}", fontsize=10)
        y += 30

        # Summary section
        page.insert_text((x, y), "Summary", fontsize=12, color=(0.22, 0.26, 0.32))  # #374151
        y += 25

        # Summary table header
        page.draw_rect((x, y, x + 300, y + 25), fill=(0.23, 0.51, 0.96))  # #3B82F6
        page.insert_text((x + 10, y + 18), "Entity Type", fontsize=10, color=(1, 1, 1))
        page.insert_text((x + 210, y + 18), "Count", fontsize=10, color=(1, 1, 1))
        y += 25

        # One row per populated entity type, alternating light gray / white.
        total_entities = 0
        for row_index, (entity_type, values) in enumerate(populated):
            fill_color = (0.95, 0.96, 0.97) if row_index % 2 == 0 else (1, 1, 1)

            page.draw_rect((x, y, x + 300, y + 20), fill=fill_color)
            page.insert_text((x + 10, y + 13), entity_type, fontsize=10)
            page.insert_text((x + 210, y + 13), str(len(values)), fontsize=10)
            y += 20
            total_entities += len(values)

        # Total row
        y += 5
        page.draw_rect((x, y, x + 300, y + 25), fill=(0.95, 0.96, 0.97))
        page.insert_text((x + 10, y + 18), "Total", fontsize=10, fontname="helv", color=(0, 0, 0))
        page.insert_text((x + 210, y + 18), str(total_entities), fontsize=10, fontname="helv", color=(0, 0, 0))
        y += 40

        # Detailed entities section; start a new page if too little room is left.
        if y > page_height - 100:
            page = doc.new_page()
            y = margin

        page.insert_text((x, y), "Detailed Entities", fontsize=12, color=(0.22, 0.26, 0.32))
        y += 30

        for entity_type, values in populated:
            if y > page_height - 50:
                page = doc.new_page()
                y = margin

            page.insert_text((x, y), f"{entity_type}:", fontsize=11, color=(0.12, 0.23, 0.54))
            y += 20

            for value in values:
                if y > page_height - 30:
                    page = doc.new_page()
                    y = margin

                page.insert_text((x + 20, y), f"• {value}", fontsize=10)
                y += line_height

            y += 10

        # Write the finished document into the in-memory buffer.
        doc.save(buffer)

    buffer.seek(0)
    return buffer

def display_pdf_viewer(pdf_bytes: bytes, filename: str, preview: bool = False):
    """Show the first page of a PDF as an image, with a file/page-count caption.

    Args:
        pdf_bytes: PDF content handed to ``fitz.open(stream=...)``. ``main``
            passes raw bytes for the source document and the ``BytesIO``
            returned by ``create_pdf_preview`` for the preview.
        filename: Name shown in the caption when ``preview`` is False.
        preview: When True the caption shows "Entities Preview.pdf" instead
            of ``filename``.

    Any exception raised while opening, rasterising or rendering is reported
    through ``st.error`` rather than propagated.
    """
    try:
        # Read everything needed from the document inside the context manager
        # so the handle is released even if rasterising fails (for example
        # when the document has no pages and indexing [0] raises).
        with fitz.open(stream=pdf_bytes, filetype="pdf") as pdf_document:
            page_count = len(pdf_document)
            first_page = pdf_document[0]
            img_bytes = first_page.get_pixmap(dpi=150).tobytes("png")

        # Display the rasterised first page
        st.image(img_bytes, use_container_width=True)

        # Show document info
        col1, col2 = st.columns(2)
        with col1:
            display_name = "Entities Preview.pdf" if preview else filename
            st.caption(f"**File:** {display_name}")
        with col2:
            st.caption(f"**Pages:** {page_count}")

    except Exception as e:
        st.error(f"Error displaying PDF: {str(e)}")

def fetch_pdf_from_url(url: str) -> Tuple[bool, bytes]:
    """Download a PDF from a URL.

    The response is accepted as a PDF when any of the following holds:
    the Content-Type header mentions "pdf", the URL (or its path component,
    ignoring query string and fragment) ends with ".pdf", or the body starts
    with the ``%PDF-`` signature.

    Args:
        url: Address to fetch with a 10-second timeout.

    Returns:
        ``(True, content)`` on success. ``(False, b"")`` when the request
        fails (reported with ``st.error``) or the response does not look like
        a PDF (reported with ``st.warning``).
    """
    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        }
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()

        content = response.content
        content_type = response.headers.get('content-type', '').lower()
        # Checking the parsed path lets "…/file.pdf?download=1" pass the
        # suffix test, which the raw-URL check alone would miss.
        url_path = urlparse(url).path.lower()
        # Signature sniff covers PDFs served with a generic content type
        # from URLs that carry no ".pdf" suffix at all.
        has_pdf_signature = content[:1024].lstrip().startswith(b'%PDF-')

        if (
            'pdf' in content_type
            or url.lower().endswith('.pdf')
            or url_path.endswith('.pdf')
            or has_pdf_signature
        ):
            return True, content

        st.warning("The URL does not appear to point to a PDF file.")
        return False, b""
    except requests.exceptions.RequestException as e:
        st.error(f"Error fetching PDF: {str(e)}")
        return False, b""

def clear_session():
    """Reset every app-owned session key to its empty default.

    Covers the extracted text, entity dict, generated Word/PDF buffers, the
    loaded PDF bytes and filename, and the flag that reveals the results.
    """
    blank_state = {
        'extracted_text': "",
        'entities': {},
        'docx_buffer': None,
        'pdf_bytes': None,
        'filename': "",
        'show_entities': False,
        'pdf_preview_buffer': None,
    }
    for state_key, blank_value in blank_state.items():
        st.session_state[state_key] = blank_value

# Main application
def main():
    """Render the app: load a PDF (URL or upload), extract entities, offer reports.

    Flow per script run:
      1. Draw the header, the URL row (text input, Fetch, Clear All) and the
         file uploader.
      2. Load a PDF into session state when Fetch was clicked with a URL, or
         when the uploader holds a file that has not been loaded yet.
      3. With a PDF loaded, show it on the left and the extraction controls
         and results on the right; otherwise show the welcome instructions.
    """
    # Header
    st.markdown('<div class="main-header">PDF Entity Extractor</div>', unsafe_allow_html=True)

    # Top section for URL input with aligned buttons
    st.markdown("### Fetch PDF from URL")

    # Create columns for aligned layout
    url_col1, url_col2, url_col3 = st.columns([4, 1, 1])

    with url_col1:
        url_input = st.text_input(
            "Enter PDF URL",
            placeholder="https://example.com/document.pdf",
            label_visibility="collapsed",
            key="url_input"
        )

    with url_col2:
        fetch_btn = st.button("Fetch", use_container_width=True, type="primary")

    with url_col3:
        if st.button("Clear All", use_container_width=True, type="secondary"):
            clear_session()
            st.rerun()

    # File upload section
    st.markdown("---")
    uploaded_file = st.file_uploader(
        "Upload PDF File",
        type=["pdf"],
        help="Select a PDF file from your device",
        label_visibility="collapsed"
    )

    # The uploader keeps returning the same file on every rerun. Reloading it
    # each time would reset show_entities (hiding results as soon as any other
    # widget, e.g. the Word download, triggers a rerun), undo "Clear All", and
    # override a PDF fetched from a URL. So remember which upload was last
    # seen and only load a file when it differs from that one.
    upload_signature = None
    if uploaded_file is not None:
        upload_signature = (
            uploaded_file.name,
            uploaded_file.size,
            # NOTE(review): file_id is used when the UploadedFile exposes it;
            # name + size alone is the fallback.
            getattr(uploaded_file, "file_id", None),
        )
    is_new_upload = (
        upload_signature is not None
        and upload_signature != st.session_state.get("last_upload_signature")
    )
    # Removing the file from the uploader stores None here, so uploading the
    # same file again later counts as new.
    st.session_state["last_upload_signature"] = upload_signature

    # Handle URL fetch
    if fetch_btn and url_input:
        with st.spinner("Fetching PDF document..."):
            success, pdf_bytes = fetch_pdf_from_url(url_input)
            if success:
                st.session_state.pdf_bytes = pdf_bytes
                # Take the last path segment only, so a query string or
                # fragment does not end up in the download filename.
                url_path = urlparse(url_input).path
                st.session_state.filename = unquote(url_path.rsplit("/", 1)[-1]) or "document.pdf"
                st.success("PDF document fetched successfully")
                st.session_state.show_entities = False

    # Handle file upload (only the first run after a new file is selected)
    elif is_new_upload:
        st.session_state.pdf_bytes = uploaded_file.getvalue()
        st.session_state.filename = uploaded_file.name
        st.session_state.show_entities = False

    # Process PDF if available
    if st.session_state.pdf_bytes:
        # Create two columns
        col1, col2 = st.columns([1, 1], gap="large")

        with col1:
            st.markdown('<div class="section-header">PDF Document</div>', unsafe_allow_html=True)
            display_pdf_viewer(st.session_state.pdf_bytes, st.session_state.filename)

            # Download button for PDF
            st.download_button(
                label="Download PDF",
                data=st.session_state.pdf_bytes,
                file_name=st.session_state.filename,
                mime="application/pdf",
                use_container_width=True,
                key="pdf_download"
            )

        with col2:
            st.markdown('<div class="section-header">Entity Extraction</div>', unsafe_allow_html=True)

            # Extract entities button
            if st.button("Extract Entities", use_container_width=True, type="primary"):
                with st.spinner("Processing document..."):
                    text = extract_text_from_pdf(st.session_state.pdf_bytes)
                    if text.strip():
                        entities = extract_entities(text)
                        st.session_state.extracted_text = text
                        st.session_state.entities = entities

                        # Create Word document
                        docx_buffer = create_word_document(entities, st.session_state.filename)
                        st.session_state.docx_buffer = docx_buffer

                        # Create PDF preview
                        pdf_preview_buffer = create_pdf_preview(entities, st.session_state.filename)
                        st.session_state.pdf_preview_buffer = pdf_preview_buffer

                        st.session_state.show_entities = True
                        st.success(f"Extracted {sum(len(v) for v in entities.values())} entities")
                    else:
                        st.error("Could not extract text from PDF. The document may be scanned or contain only images.")

            # Display entities if available
            if st.session_state.show_entities and st.session_state.entities:
                if any(st.session_state.entities.values()):
                    st.markdown("### Extracted Entities Preview")

                    # Display PDF preview
                    if st.session_state.pdf_preview_buffer:
                        display_pdf_viewer(st.session_state.pdf_preview_buffer, "", preview=True)

                    # Download Word document button
                    if st.session_state.docx_buffer:
                        st.markdown("### Download Report")
                        doc_name = f"{st.session_state.filename.rsplit('.', 1)[0]}_entities_report.docx"

                        st.download_button(
                            label="Download Word Report",
                            data=st.session_state.docx_buffer,
                            file_name=doc_name,
                            mime="application/vnd.openxmlformats-officedocument.wordprocessingml.document",
                            use_container_width=True,
                            key="docx_download"
                        )

                        st.caption("Note: The preview above is a PDF. The download will be a Word document containing the complete report.")

                else:
                    st.info("No entities found in the document.")
            else:
                st.info("Click 'Extract Entities' to begin entity extraction.")

    else:
        # Show instructions when no PDF is loaded
        st.markdown('<div class="info-box">', unsafe_allow_html=True)
        st.markdown("""
        ### Welcome to PDF Entity Extractor
        
        **To get started, you can either:**
        1. **Enter a URL** - Provide a direct link to a PDF file above
        2. **Upload a PDF** - Use the upload area below to select a file from your device
        
        **What this tool does:**
        - Extracts entities like dates, email addresses, phone numbers, URLs, monetary values, names, and organizations
        - Generates a comprehensive Word document report
        - Provides a PDF preview of extracted entities
        
        **Supported formats:** PDF files with extractable text
        """)
        st.markdown('</div>', unsafe_allow_html=True)

# Entry point: build the UI only when this file is executed as the main
# script (the header comment shows it being launched via `streamlit run`),
# not when it is imported as a module.
if __name__ == "__main__":
    main()