csv-generation-img

Sleeping

App Files Files Community

csv-generation-img / app.py

Nechba

Update app.py

41acda9 verified 8 months ago

raw

history blame

7.91 kB

	import streamlit as st
	import pandas as pd
	from io import BytesIO
	import os
	from dotenv import load_dotenv
	from utils import (
	configure_gemini,
	analyze_pdf_directly,
	csv_to_dataframe,
	save_csv,
	get_pdf_metadata,
	extract_csv_from_response,
	pdf_to_images,
	analyze_single_document,
	process_local_pdf
	)
	import base64
	from datetime import datetime
	import tempfile

	# Load variables from a local .env file (if present) into the process
	# environment so that later os.getenv() lookups can see them.
	load_dotenv()

	# Page-level settings, gathered in one mapping and applied in a single call.
	# This is the first Streamlit command the script issues.
	_PAGE_SETTINGS = {
	    "page_title": "PDF Document Analyzer",
	    "page_icon": "📄",
	    "layout": "wide",
	    "initial_sidebar_state": "expanded",
	}
	st.set_page_config(**_PAGE_SETTINGS)

	# Stylesheet injected into the page: card look for documents, button
	# colours, and a left accent bar for analysis sections.
	_CUSTOM_CSS = """
	<style>
	.document-card {
	border-radius: 10px;
	padding: 1.5rem;
	margin-bottom: 1.5rem;
	background-color: white;
	box-shadow: 0 4px 12px rgba(0,0,0,0.1);
	transition: transform 0.2s;
	}
	.document-card:hover {
	transform: translateY(-2px);
	}
	.stButton>button {
	background-color: #4285F4;
	color: white;
	border-radius: 8px;
	padding: 0.5rem 1.5rem;
	font-weight: 500;
	}
	.analysis-section {
	border-left: 4px solid #4285F4;
	padding-left: 1rem;
	margin-top: 1.5rem;
	}
	</style>
	"""
	# unsafe_allow_html lets the <style> element through as real HTML.
	st.markdown(_CUSTOM_CSS, unsafe_allow_html=True)

	# App header: page title followed by a one-line description.
	st.title("📄 PDF Document Analyzer")
	st.markdown("Upload multiple PDFs to analyze each document directly using Gemini's native PDF processing")

	# Instruction prompt for analysing a real-estate document set. It asks the
	# model to identify the parties, missing signatures/initials, checked boxes,
	# follow-up ("secondary") questions, required paperwork, conflicts and the
	# location of each finding, and to report everything as CSV with the columns
	# listed in step 8.
	# NOTE(review): PROMPT is not referenced anywhere in the visible part of this
	# file; presumably the helpers imported from utils carry their own prompt --
	# confirm before relying on edits made here.
	PROMPT ="""Please analyze the provided images of the real estate document set and perform the following actions:

	1. Identify Parties: Determine and list Seller 1, Seller 2 (if applicable), Buyer 1, and Buyer 2.
	2. Identify Missing Items: Locate and list all instances of missing signatures and missing initials for all parties across all documents.
	3. Identify Checked Boxes: Locate and list all checkboxes that have been marked or checked.
	4. Generate Secondary Questions: For checkboxes that indicate significant waivers (e.g., home warranty, inspection rights, lead paint assessment), specific conditions (e.g., cash sale, contingency status), potential conflicts, or reference other documents, formulate a relevant 'Secondary Question' designed to prompt confirmation or clarification from the user/parties involved.
	5. Check for Required Paperwork: Based only on the checkboxes identified in step 3 that explicitly state or strongly imply a specific addendum or disclosure document should be attached (e.g., "Lead Based Paint Disclosure Addendum attached", "See Counter Offer Addendum", "Seller's Disclosure...Addendum attached", "Retainer Addendum attached", etc.), check if a document matching that description appears to be present within the provided image set. Note whether this implied paperwork is 'Found', 'Missing', or 'Potentially Missing/Ambiguous' within the provided images.
	6. Identify Conflicts: Specifically look for and note any directly contradictory information or conflicting checked boxes (like the conflicting inspection clauses found previously).
	7. Provide Location: For every identified item (missing signature/initial, checked box, required paperwork status, party identification, conflict), specify the approximate line number(s) or clear location on the page (e.g., Bottom Right Initials, Seller Signature Block).
	8. Format Output: Present all findings comprehensively in CSV format. The CSV columns should be:
	* Category (e.g., Parties, Missing Item, Checked Box, Required Paperwork, Conflict)
	* Location (Document Name/Page, e.g., Sale Contract Pg 2)
	* Line Item(s) (Approximate line number or location description)
	* Item Type (e.g., Seller Initials, Home Warranty Waiver, Lead Paint Addendum Check, Lead Paint Addendum Document)
	* Status (e.g., Identified, Missing, Checked, Found, Potentially Missing, Conflict Detected)
	* Details (Specifics like names, text of the checkbox, description of the issue or document status)
	* Secondary Question (if applicable) (The question generated in step 4)

	Please apply this analysis to the entire set of documents provided.
	"""

	# Sidebar: Gemini configuration plus a short feature summary.
	# A password-style text input for the API key used to be offered here; it is
	# disabled, so the key is taken from the GEMINI_API_KEY environment variable
	# only (empty string when unset).
	_FEATURES_NOTE = """
	Features:
	- PDF processing using images partitioned by page
	- Individual analysis for each document
	- Downloadable CSV reports
	"""
	with st.sidebar:
	    # api_key is also read by the main page logic further down.
	    api_key = os.environ.get("GEMINI_API_KEY", "")
	    configure_gemini(api_key)

	    st.markdown("---")
	    st.info(_FEATURES_NOTE)

	# Main App Content
	# The user uploads one or more PDFs; each is shown with its metadata and an
	# "Analyze" button that runs the analysis and offers the result as an Excel
	# download.
	uploaded_files = st.file_uploader(
	    "Upload PDF Documents",
	    type=["pdf"],
	    accept_multiple_files=True,
	    help="Upload multiple PDF documents for analysis",
	)

	if uploaded_files and api_key:
	    st.success(f"✅ {len(uploaded_files)} PDF(s) ready for analysis")

	    # Process each PDF separately
	    for i, uploaded_file in enumerate(uploaded_files):
	        # Read the upload once; the same bytes feed the metadata lookup, the
	        # size metric and the analysis call.
	        pdf_bytes = uploaded_file.getvalue()

	        with st.container():
	            st.markdown(f"### 📑 Document {i+1}: {uploaded_file.name}")

	            # Display document info
	            metadata = get_pdf_metadata(pdf_bytes)
	            col1, col2, col3 = st.columns(3)
	            with col1:
	                st.metric("Pages", metadata['page_count'])
	            with col2:
	                st.metric("Size", f"{len(pdf_bytes) / 1024:.1f} KB")
	            with col3:
	                # NOTE: st.button is True only on the rerun triggered by its
	                # click, so the results below vanish on the next interaction
	                # (including a click on the download button).
	                if st.button("Analyze Document", key=f"analyze_{i}"):
	                    with st.spinner(f"Analyzing {uploaded_file.name}..."):
	                        try:
	                            df = process_local_pdf(pdf_bytes)

	                            # Display results in expandable section
	                            with st.expander("View Analysis Results", expanded=True):
	                                # Guard against a missing result as well as
	                                # an empty table.
	                                if df is not None and not df.empty:
	                                    st.dataframe(df)

	                                    # Serialise the table to an in-memory
	                                    # .xlsx file for the download button.
	                                    excel_buffer = BytesIO()
	                                    df.to_excel(excel_buffer, index=False)
	                                    excel_data = excel_buffer.getvalue()

	                                    # Name the report after the PDF without
	                                    # its extension, so the result is not
	                                    # "name.pdf_analysis_....xlsx".
	                                    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
	                                    pdf_stem = os.path.splitext(uploaded_file.name)[0]
	                                    report_filename = f"{pdf_stem}_analysis_{timestamp}.xlsx"

	                                    st.download_button(
	                                        label="Download Analysis",
	                                        data=excel_data,
	                                        file_name=report_filename,
	                                        mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
	                                        key=f"download_{i}",
	                                    )
	                                else:
	                                    # Only the parsed table is available
	                                    # here; there is no raw model response to
	                                    # show (the old code referenced an
	                                    # undefined `raw_response` and raised
	                                    # NameError).
	                                    st.warning("No CSV data found in response")

	                        except Exception as e:
	                            # UI boundary: report the failure for this
	                            # document and keep the other documents usable.
	                            st.error(f"Analysis failed: {str(e)}")

	            st.markdown("---")

	elif not api_key:
	    # The key is read from the environment only; there is no sidebar input.
	    st.warning("⚠️ GEMINI_API_KEY is not set. Add it to the environment (or a .env file) to proceed")

	else:
	    # A key is present, so the only remaining case is "nothing uploaded yet".
	    st.info("📤 Please upload PDF documents using the file uploader above")