# Spaces: Sleeping  (page status banner captured along with the listing; not part of the app code)
import base64
import os
import tempfile
from datetime import datetime
from io import BytesIO
from pathlib import Path

import pandas as pd
import streamlit as st
from dotenv import load_dotenv

from utils import (
    analyze_pdf_directly,
    analyze_single_document,
    configure_gemini,
    csv_to_dataframe,
    extract_csv_from_response,
    get_pdf_metadata,
    pdf_to_images,
    process_local_pdf,
    save_csv,
)
# Load environment variables (GEMINI_API_KEY is read from them further down).
load_dotenv()

# Configure page settings. Kept ahead of every other st.* call in the script.
# NOTE(review): the icon glyph was mis-decoded in the captured source; a page
# emoji is restored here as the closest match for a document analyzer - confirm.
st.set_page_config(
    page_title="PDF Document Analyzer",
    page_icon="📄",
    layout="wide",
    initial_sidebar_state="expanded",
)
# Custom CSS styling: rules for the document card, the Streamlit buttons and
# the analysis section, injected into the page as a raw <style> element.
_CUSTOM_CSS = """
<style>
.document-card {
border-radius: 10px;
padding: 1.5rem;
margin-bottom: 1.5rem;
background-color: white;
box-shadow: 0 4px 12px rgba(0,0,0,0.1);
transition: transform 0.2s;
}
.document-card:hover {
transform: translateY(-2px);
}
.stButton>button {
background-color: #4285F4;
color: white;
border-radius: 8px;
padding: 0.5rem 1.5rem;
font-weight: 500;
}
.analysis-section {
border-left: 4px solid #4285F4;
padding-left: 1rem;
margin-top: 1.5rem;
}
</style>
"""

# unsafe_allow_html is needed so the <style> tag is emitted as HTML, not text.
st.markdown(_CUSTOM_CSS, unsafe_allow_html=True)
# App header: page title plus a one-line description of what the app does.
# NOTE(review): the title emoji was mis-decoded in the captured source; a page
# emoji is restored here as the closest match - confirm.
st.title("📄 PDF Document Analyzer")
st.markdown(
    "Upload multiple PDFs to analyze each document directly using Gemini's native PDF processing"
)
# Analysis prompt: the instructions describing what to extract from a real
# estate document set and the CSV columns the answer must use. One literal per
# prompt line; adjacent literals are concatenated by the parser.
# NOTE(review): nothing in the visible part of this file references PROMPT.
PROMPT = (
    "Please analyze the provided images of the real estate document set and perform the following actions:\n"
    "1. *Identify Parties:* Determine and list Seller 1, Seller 2 (if applicable), Buyer 1, and Buyer 2.\n"
    "2. *Identify Missing Items:* Locate and list all instances of missing signatures and missing initials for all parties across all documents.\n"
    "3. *Identify Checked Boxes:* Locate and list all checkboxes that have been marked or checked.\n"
    "4. *Generate Secondary Questions:* For checkboxes that indicate significant waivers (e.g., home warranty, inspection rights, lead paint assessment), specific conditions (e.g., cash sale, contingency status), potential conflicts, or reference other documents, formulate a relevant 'Secondary Question' designed to prompt confirmation or clarification from the user/parties involved.\n"
    "5. *Check for Required Paperwork:* Based only on the checkboxes identified in step 3 that explicitly state or strongly imply a specific addendum or disclosure document should be attached (e.g., \"Lead Based Paint Disclosure Addendum attached\", \"See Counter Offer Addendum\", \"Seller's Disclosure...Addendum attached\", \"Retainer Addendum attached\", etc.), check if a document matching that description appears to be present within the provided image set. Note whether this implied paperwork is 'Found', 'Missing', or 'Potentially Missing/Ambiguous' within the provided images.\n"
    "6. *Identify Conflicts:* Specifically look for and note any directly contradictory information or conflicting checked boxes (like the conflicting inspection clauses found previously).\n"
    "7. *Provide Location:* For every identified item (missing signature/initial, checked box, required paperwork status, party identification, conflict), specify the approximate line number(s) or clear location on the page (e.g., Bottom Right Initials, Seller Signature Block).\n"
    "8. *Format Output:* Present all findings comprehensively in CSV format. The CSV columns should be:\n"
    "* Category (e.g., Parties, Missing Item, Checked Box, Required Paperwork, Conflict)\n"
    "* Location (Document Name/Page, e.g., Sale Contract Pg 2)\n"
    "* Line Item(s) (Approximate line number or location description)\n"
    "* Item Type (e.g., Seller Initials, Home Warranty Waiver, Lead Paint Addendum Check, Lead Paint Addendum Document)\n"
    "* Status (e.g., Identified, Missing, Checked, Found, Potentially Missing, Conflict Detected)\n"
    "* Details (Specifics like names, text of the checkbox, description of the issue or document status)\n"
    "* Secondary Question (if applicable) (The question generated in step 4)\n"
    "Please apply this analysis to the entire set of documents provided.\n"
)
# Sidebar configuration.
# The Gemini API key comes from the GEMINI_API_KEY environment variable only;
# an empty string means "not configured".
api_key = os.getenv("GEMINI_API_KEY", "")

with st.sidebar:
    # Configure the Gemini client only when a key is actually present. With no
    # key the main page shows a warning and never starts an analysis, so there
    # is nothing to configure.
    if api_key:
        configure_gemini(api_key)

    st.markdown("---")
    # The report offered for download is an .xlsx workbook, so the feature
    # list says Excel rather than CSV.
    st.info(
        "**Features:**\n"
        "- PDF processing using images partitioned by page\n"
        "- Individual analysis for each document\n"
        "- Downloadable Excel (.xlsx) reports\n"
    )
# Main app content: uploader restricted to PDF files, several files at once.
uploaded_files = st.file_uploader(
    label="Upload PDF Documents",
    type=["pdf"],
    accept_multiple_files=True,
    help="Upload multiple PDF documents for analysis",
)
def _render_analysis(df: "pd.DataFrame | None", pdf_name: str, index: int) -> None:
    """Show one document's analysis table and an Excel download button.

    Args:
        df: Result of ``process_local_pdf`` for the document. ``None`` or an
            empty frame is reported as "no data" instead of being rendered.
        pdf_name: Name of the uploaded PDF; its stem becomes the prefix of the
            download filename.
        index: Position of the document in the upload list; keeps the download
            button's widget key unique per document.
    """
    with st.expander("View Analysis Results", expanded=True):
        if df is None or df.empty:
            # Only the DataFrame is available here, so there is no raw model
            # response to display alongside the warning.
            st.warning("No CSV data found in response")
            return

        st.dataframe(df)

        # Serialize the table to an in-memory .xlsx workbook for download.
        excel_buffer = BytesIO()
        df.to_excel(excel_buffer, index=False)

        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        # Use the stem so the name is "report_analysis_<ts>.xlsx" rather than
        # "report.pdf_analysis_<ts>.xlsx".
        report_filename = f"{Path(pdf_name).stem}_analysis_{timestamp}.xlsx"
        st.download_button(
            label="Download Analysis",
            data=excel_buffer.getvalue(),
            file_name=report_filename,
            mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
            key=f"download_{index}",
        )


if uploaded_files and api_key:
    st.success(f"✅ {len(uploaded_files)} PDF(s) ready for analysis")

    # Process each PDF separately
    for i, uploaded_file in enumerate(uploaded_files):
        with st.container():
            st.markdown(f"### 📄 Document {i+1}: {uploaded_file.name}")

            # Read the upload once and reuse the bytes for metadata, the size
            # metric and the analysis itself.
            pdf_bytes = uploaded_file.getvalue()
            metadata = get_pdf_metadata(pdf_bytes)

            # A button is True only during the run in which it was clicked, and
            # clicking the download button starts a new run. Keeping the result
            # in session state lets the table and download survive that rerun.
            result_key = f"analysis_result_{i}_{uploaded_file.name}_{len(pdf_bytes)}"

            # Display document info
            col1, col2, col3 = st.columns(3)
            with col1:
                st.metric("Pages", metadata['page_count'])
            with col2:
                st.metric("Size", f"{len(pdf_bytes) / 1024:.1f} KB")
            with col3:
                analyze_clicked = st.button("Analyze Document", key=f"analyze_{i}")

            # UI boundary: any failure in analysis or rendering is reported in
            # the page instead of aborting the remaining documents.
            try:
                if analyze_clicked:
                    with st.spinner(f"Analyzing {uploaded_file.name}..."):
                        st.session_state[result_key] = process_local_pdf(pdf_bytes)

                # Rendered at container level, below the metrics row.
                if result_key in st.session_state:
                    _render_analysis(st.session_state[result_key], uploaded_file.name, i)
            except Exception as exc:
                # Drop a cached result that could not be shown so the same
                # error is not repeated on every later rerun.
                if result_key in st.session_state:
                    del st.session_state[result_key]
                st.error(f"Analysis failed: {exc}")

        st.markdown("---")
elif not api_key:
    # The key is read from the environment; there is no sidebar input for it.
    st.warning(
        "⚠️ Gemini API key not found. Set the GEMINI_API_KEY environment variable "
        "(for example in a .env file) to proceed"
    )
else:
    # Reached only when a key is present but nothing has been uploaded yet.
    st.info("📤 Please upload PDF documents using the file uploader above")