rairo commited on
Commit
ec735f4
·
verified ·
1 Parent(s): 0f5777e

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +218 -691
app.py CHANGED
@@ -2,52 +2,54 @@ import re
2
  import json
3
  import os
4
  import time
5
- from datetime import datetime, date, timedelta
6
  from io import BytesIO
7
- import requests
8
  import pandas as pd
9
  import streamlit as st
10
  import google.generativeai as genai
11
  import pypdf
12
  from fpdf import FPDF
13
- from fpdf.enums import XPos, YPos
14
- import markdown
15
  from google.api_core import exceptions
16
- from html_to_markdown import convert_to_markdown
 
17
 
18
- # Configure API key for Gemini
19
  api_key = os.getenv('Gemini')
20
 
21
  def configure_gemini(api_key):
22
- st.info("Configuring Gemini API for transaction extraction...") # Log
 
 
 
23
  genai.configure(api_key=api_key)
 
24
  return genai.GenerativeModel('gemini-2.0-flash-thinking-exp')
25
 
26
  def configure_gemini1(api_key):
27
- st.info("Configuring Gemini API for report generation...") # Log
 
 
 
28
  genai.configure(api_key=api_key)
 
29
  return genai.GenerativeModel('gemini-2.5-pro')
30
-
31
- # Read PDF content page by page from a file-like object
32
  def read_pdf_pages(file_obj):
33
- st.info(f"Reading PDF pages from {file_obj.name}...") # Log
34
- file_obj.seek(0) # Ensure the file pointer is at the start
35
  pdf_reader = pypdf.PdfReader(file_obj)
36
  total_pages = len(pdf_reader.pages)
37
- st.info(f"Found {total_pages} pages in PDF.") # Log
38
  return pdf_reader, total_pages
39
 
40
- # Extract text from a specific page
41
  def extract_page_text(pdf_reader, page_num):
42
- # st.debug(f"Extracting text from page {page_num + 1}...") # Too verbose for general logging
43
  if page_num < len(pdf_reader.pages):
44
  text = pdf_reader.pages[page_num].extract_text()
45
- if not text.strip():
46
- st.warning(f"Page {page_num + 1} appears to be empty or contains no extractable text.") # Log empty pages
47
  return text if text else ""
48
  return ""
49
 
50
- # Process a chunk of PDF text with Gemini to extract transactions as JSON
51
  def process_with_gemini(model, text):
52
  prompt = """Analyze this bank statement and extract transactions in JSON format with these fields:
53
  - Date (format DD/MM/YYYY)
@@ -74,422 +76,166 @@ def process_with_gemini(model, text):
74
  ]
75
  }"""
76
  try:
77
- # st.debug("Sending text chunk to Gemini for transaction extraction...") # Too verbose
78
  response = model.generate_content([prompt, text])
79
- time.sleep(6) # Sleep for 6 seconds to work around rate limit
80
- # st.debug("Received response from Gemini for transaction extraction.") # Too verbose
81
  return response.text
82
- except exceptions.ServiceUnavailable as e:
83
- if e.response.status_code == 504:
84
- st.error("Error generating report: Gemini API timed out (504). Please try reducing the time period for the report.")
85
- return None
86
- else:
87
- st.error(f"Gemini API error during transaction extraction: {e}") # Log other API errors
88
- raise
89
  except Exception as e:
90
- st.error(f"An unexpected error occurred during Gemini transaction extraction: {e}") # Catch other potential errors
91
  return None
92
 
93
- # Process PDF page by page to handle large files
94
  def process_pdf_pages(model, pdf_reader, total_pages, progress_callback=None):
95
  all_transactions = []
96
- st.info(f"Starting page-by-page PDF processing for {total_pages} pages...") # Log
97
-
98
- # Process pages individually or in small chunks
99
  for page_num in range(total_pages):
100
- # Update progress if callback provided
101
  if progress_callback:
102
  progress_callback(page_num / total_pages, f"Processing page {page_num + 1} of {total_pages}")
103
-
104
- # Extract text from current page
105
  page_text = extract_page_text(pdf_reader, page_num)
106
-
107
  if not page_text.strip():
108
- st.warning(f"Skipping empty or unreadable page {page_num + 1}.") # Log skipped pages
109
- continue # Skip empty pages
110
-
111
- # Process the page with Gemini
112
- st.info(f"Sending page {page_num + 1} text to Gemini for transaction extraction...") # Log
113
  json_response = process_with_gemini(model, page_text)
114
-
115
  if json_response:
116
- # Extract JSON from response
117
- start_idx = json_response.find('{')
118
- end_idx = json_response.rfind('}') + 1
119
-
120
- if start_idx == -1 or end_idx == 0:
121
- st.warning(f"No valid JSON found in Gemini response for page {page_num + 1}. Raw response: {json_response[:200]}...") # Log invalid JSON structure
122
- continue # Skip invalid JSON
123
-
124
- json_str = json_response[start_idx:end_idx]
125
- json_str = json_str.replace('```json', '').replace('```', '')
126
-
127
  try:
128
  data = json.loads(json_str)
129
  transactions = data.get('transactions', [])
130
  if transactions:
131
- st.info(f"Successfully extracted {len(transactions)} transactions from page {page_num + 1}.") # Log successful extraction
132
  all_transactions.extend(transactions)
133
- else:
134
- st.info(f"No transactions found on page {page_num + 1} based on Gemini's analysis.") # Log no transactions found on page
135
  except json.JSONDecodeError:
136
- st.error(f"Failed to decode JSON from Gemini response for page {page_num + 1}. Check response format. Raw JSON snippet: {json_str[:200]}...") # Log JSON decode errors
137
- continue # Skip invalid JSON
138
  else:
139
- st.warning(f"Gemini returned no response for page {page_num + 1}. This page's transactions might be missing.") # Log no response from Gemini
140
 
141
- st.info(f"Finished processing all pages. Total transactions extracted: {len(all_transactions)}.") # Final log for extraction
142
  return all_transactions
143
 
144
- # Generate financial report from aggregated JSON transactions and chosen parameters
145
- def generate_financial_report(model, json_data, start_date, end_date, statement_type):
146
- st.info(f"Preparing prompt for Gemini to generate {statement_type} report from {start_date} to {end_date}...") # Log
147
- prompt = f"""Based on the following transactions JSON data:
148
- {json.dumps(json_data)}
149
- Generate a detailed {statement_type} report for the period from {start_date.strftime('%d/%m/%Y')} to {end_date.strftime('%d/%m/%Y')}. Present the report in a standard accounting format relevant to South Africa, but with improved readability and visual appeal.
150
-
151
- Specific Formatting and Content Requirements:
152
-
153
- Standard Accounting Structure (South Africa Focus): Organize the {statement_type} according to typical accounting practices followed in South Africa (e.g., for an Income Statement, clearly separate Revenue, Cost of Goods Sold, Gross Profit, Operating Expenses, and Net Income, in nice tables considering local terminology where applicable). If unsure of specific local variations, adhere to widely accepted international accounting structures.
154
- Clear Headings and Subheadings: Use distinct and informative headings and subheadings in English to delineate different sections of the report. Ensure these are visually prominent.
155
- Consistent Formatting: Maintain consistent formatting for monetary values (e.g., using "R" for South African Rand if applicable and discernible from the data, comma separators for thousands), dates, and alignment.
156
- Totals and Subtotals: Clearly display totals for relevant categories and subtotals where appropriate to provide a clear understanding of the financial performance or position.
157
- Descriptive Line Items: Use clear and concise descriptions for each transaction or aggregated account based on the provided JSON data.
158
- Key Insights: Include a brief section (e.g., "Key Highlights" or "Summary") that identifies significant trends, notable figures, or key performance indicators derived from the data within the statement. This should be written in plain, understandable English, potentially highlighting aspects particularly relevant to the economic context of Zimbabwe if discernible from the data.
159
- Concise Summary: Provide a concluding summary paragraph that encapsulates the overall financial picture presented in the {statement_type}.
160
-
161
- Format the report in Markdown for better visual structure.
162
- Do not name the company if name is not there and return just the report and nothing else."""
163
- try:
164
- st.info("Sending request to Gemini for financial report generation...") # Log
165
- response = model.generate_content([prompt])
166
- time.sleep(7) # Sleep for 7 seconds to work around rate limit
167
- st.success("Successfully received financial report from Gemini.") # Log success
168
- return response.text
169
- except exceptions.ServiceUnavailable as e:
170
- if e.response.status_code == 504:
171
- st.error("Error generating report: Gemini API timed out (504). Please try reducing the time period for the report.")
172
- st.session_state['last_error'] = "504" # Store the error in session state
173
- return None
174
- else:
175
- st.error(f"Gemini API error during report generation: {e}") # Log other API errors
176
- raise
177
- except Exception as e:
178
- st.error(f"An unexpected error occurred during Gemini report generation: {e}") # Catch other potential errors
179
  return None
180
 
181
- def chunk_transactions(transactions, batch_size=400):
182
- """Split transactions into smaller batches for processing."""
183
- batches = []
184
- for i in range(0, len(transactions), batch_size):
185
- batch = transactions[i:i + batch_size]
186
- batches.append(batch)
187
- st.info(f"Split {len(transactions)} transactions into {len(batches)} batches of up to {batch_size} transactions each.")
188
- return batches
189
-
190
- def generate_batch_summary(model, json_data, start_date, end_date, statement_type, batch_num, total_batches):
191
- """Generate a summary analysis for a batch of transactions."""
192
- st.info(f"Processing batch {batch_num}/{total_batches} with {len(json_data['transactions'])} transactions...")
193
-
194
- prompt = f"""Analyze this batch of transactions (batch {batch_num} of {total_batches}) for the period from {start_date.strftime('%d/%m/%Y')} to {end_date.strftime('%d/%m/%Y')}.
195
-
196
- Transaction data:
197
- {json.dumps(json_data)}
198
-
199
- Create a structured summary focusing on aggregation and categorization. Return ONLY the following JSON structure:
200
-
201
- {{
202
- "batch_info": {{
203
- "batch_number": {batch_num},
204
- "total_batches": {total_batches},
205
- "transaction_count": {len(json_data['transactions'])},
206
- "date_range": "{start_date.strftime('%d/%m/%Y')} to {end_date.strftime('%d/%m/%Y')}"
207
- }},
208
- "financial_summary": {{
209
- "total_income": 0,
210
- "total_expenses": 0,
211
- "net_position": 0
212
- }},
213
- "income_breakdown": {{
214
- "by_customer": {{}},
215
- "by_month": {{}}
216
- }},
217
- "expense_breakdown": {{
218
- "by_category": {{}},
219
- "by_month": {{}}
220
- }},
221
- "key_transactions": [
222
- // Top 5 largest transactions (income and expense)
223
- ],
224
- "monthly_totals": {{
225
- // Format: "YYYY-MM": {{"income": 0, "expenses": 0, "net": 0}}
226
- }}
227
- }}
228
-
229
- Focus on numerical aggregation and categorization. Be precise with calculations."""
230
 
231
- try:
232
- response = model.generate_content([prompt])
233
- time.sleep(4)
234
- return response.text
235
- except exceptions.ServiceUnavailable as e:
236
- if e.response.status_code == 504:
237
- st.error(f"Batch {batch_num} timed out. Skipping this batch.")
238
- return None
239
- else:
240
- st.error(f"API error processing batch {batch_num}: {e}")
241
- raise
242
- except Exception as e:
243
- st.error(f"Error processing batch {batch_num}: {e}")
244
- return None
245
 
246
- def consolidate_batch_summaries(batch_summaries, start_date, end_date, statement_type):
247
- """Combine multiple batch summaries into aggregated data structure."""
248
- st.info(f"Consolidating {len(batch_summaries)} batch summaries...")
249
-
250
- consolidated = {
251
- "total_batches": len(batch_summaries),
252
- "total_transactions": 0,
253
- "date_range": f"{start_date.strftime('%d/%m/%Y')} to {end_date.strftime('%d/%m/%Y')}",
254
- "financial_summary": {
255
- "total_income": 0,
256
- "total_expenses": 0,
257
- "net_position": 0
258
- },
259
- "income_breakdown": {
260
- "by_customer": {},
261
- "by_month": {}
262
- },
263
- "expense_breakdown": {
264
- "by_category": {},
265
- "by_month": {}
266
- },
267
- "key_transactions": [],
268
- "monthly_totals": {}
269
  }
270
-
271
- # Process each batch summary
272
- for batch_data in batch_summaries:
273
- if not batch_data:
274
- continue
275
-
276
- try:
277
- # Extract JSON from response if needed
278
- if isinstance(batch_data, str):
279
- start_idx = batch_data.find('{')
280
- end_idx = batch_data.rfind('}') + 1
281
- if start_idx != -1 and end_idx > start_idx:
282
- json_str = batch_data[start_idx:end_idx]
283
- batch_data = json.loads(json_str)
284
- else:
285
- st.warning("Could not extract JSON from batch summary")
286
- continue
287
-
288
- # Aggregate financial summary
289
- if 'financial_summary' in batch_data:
290
- fs = batch_data['financial_summary']
291
- consolidated['financial_summary']['total_income'] += fs.get('total_income', 0)
292
- consolidated['financial_summary']['total_expenses'] += fs.get('total_expenses', 0)
293
-
294
- # Aggregate transaction count
295
- if 'batch_info' in batch_data:
296
- consolidated['total_transactions'] += batch_data['batch_info'].get('transaction_count', 0)
297
-
298
- # Merge income breakdown by customer
299
- if 'income_breakdown' in batch_data:
300
- for customer, amount in batch_data['income_breakdown'].get('by_customer', {}).items():
301
- consolidated['income_breakdown']['by_customer'][customer] = \
302
- consolidated['income_breakdown']['by_customer'].get(customer, 0) + amount
303
-
304
- # Merge income by month
305
- for month, amount in batch_data['income_breakdown'].get('by_month', {}).items():
306
- consolidated['income_breakdown']['by_month'][month] = \
307
- consolidated['income_breakdown']['by_month'].get(month, 0) + amount
308
-
309
- # Merge expense breakdown by category
310
- if 'expense_breakdown' in batch_data:
311
- for category, amount in batch_data['expense_breakdown'].get('by_category', {}).items():
312
- consolidated['expense_breakdown']['by_category'][category] = \
313
- consolidated['expense_breakdown']['by_category'].get(category, 0) + amount
314
-
315
- # Merge expenses by month
316
- for month, amount in batch_data['expense_breakdown'].get('by_month', {}).items():
317
- consolidated['expense_breakdown']['by_month'][month] = \
318
- consolidated['expense_breakdown']['by_month'].get(month, 0) + amount
319
-
320
- # Collect key transactions
321
- if 'key_transactions' in batch_data:
322
- consolidated['key_transactions'].extend(batch_data.get('key_transactions', []))
323
-
324
- # Merge monthly totals
325
- if 'monthly_totals' in batch_data:
326
- for month, totals in batch_data['monthly_totals'].items():
327
- if month not in consolidated['monthly_totals']:
328
- consolidated['monthly_totals'][month] = {"income": 0, "expenses": 0, "net": 0}
329
-
330
- consolidated['monthly_totals'][month]['income'] += totals.get('income', 0)
331
- consolidated['monthly_totals'][month]['expenses'] += totals.get('expenses', 0)
332
- consolidated['monthly_totals'][month]['net'] += totals.get('net', 0)
333
-
334
- except json.JSONDecodeError as e:
335
- st.warning(f"Could not parse batch summary JSON: {e}")
336
- continue
337
- except Exception as e:
338
- st.warning(f"Error processing batch summary: {e}")
339
- continue
340
-
341
- # Calculate final net position
342
- consolidated['financial_summary']['net_position'] = \
343
- consolidated['financial_summary']['total_income'] - consolidated['financial_summary']['total_expenses']
344
-
345
- st.success(f"Successfully consolidated data from {len(batch_summaries)} batches covering {consolidated['total_transactions']} transactions.")
346
- return consolidated
347
-
348
- def generate_final_report(model, consolidated_data, statement_type):
349
- """Generate the final comprehensive report using consolidated batch data."""
350
- st.info("Generating final comprehensive report from consolidated data...")
351
-
352
- prompt = f"""Using this consolidated financial data, generate a comprehensive {statement_type} report:
353
-
354
- Consolidated Data:
355
- {json.dumps(consolidated_data, indent=2)}
356
-
357
- Generate a detailed {statement_type} report with the following requirements:
358
-
359
- 1. **Professional Format**: Use standard South African accounting format and terminology
360
- 2. **Clear Structure**: Organize with proper headings, subheadings, and sections
361
- 3. **Comprehensive Analysis**: Include:
362
- - Executive Summary
363
- - Detailed breakdown by categories/customers
364
- - Monthly trend analysis
365
- - Key performance indicators
366
- - Notable transactions and patterns
367
- 4. **Visual Elements**: Use tables, proper formatting for better readability
368
- 5. **Insights**: Provide meaningful business insights based on the data
369
- 6. **Currency**: Use "R" for South African Rand where appropriate
370
-
371
- Return the report in well-formatted Markdown. Do not include company name if not available.
372
- Focus on creating a professional, comprehensive financial statement that provides clear insights into the business performance."""
373
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
374
  try:
 
375
  response = model.generate_content([prompt])
376
- time.sleep(6)
377
- st.success("Final comprehensive report generated successfully!")
378
  return response.text
379
- except exceptions.ServiceUnavailable as e:
380
- if e.response.status_code == 504:
381
- st.error("Final report generation timed out. The consolidated data might be too large.")
382
- return None
383
- else:
384
- st.error(f"API error generating final report: {e}")
385
- raise
386
- except Exception as e:
387
- st.error(f"Error generating final report: {e}")
388
  return None
389
-
390
- def generate_batched_financial_report(model, filtered_transactions, start_date, end_date, statement_type, batch_size=400):
391
- """Main function to generate financial report using batch processing."""
392
- st.info(f"Starting batched financial report generation for {len(filtered_transactions)} transactions...")
393
-
394
- # Step 1: Split transactions into batches
395
- transaction_batches = chunk_transactions(filtered_transactions, batch_size)
396
-
397
- # Step 2: Process each batch
398
- batch_summaries = []
399
- progress_bar = st.progress(0)
400
- status_text = st.empty()
401
-
402
- for i, batch in enumerate(transaction_batches):
403
- progress = (i + 1) / len(transaction_batches)
404
- progress_bar.progress(progress)
405
- status_text.text(f"Processing batch {i + 1} of {len(transaction_batches)}...")
406
-
407
- batch_json = {"transactions": batch}
408
- summary = generate_batch_summary(model, batch_json, start_date, end_date, statement_type, i + 1, len(transaction_batches))
409
-
410
- if summary:
411
- batch_summaries.append(summary)
412
-
413
- progress_bar.progress(1.0)
414
- status_text.text("All batches processed!")
415
-
416
- if not batch_summaries:
417
- st.error("No batch summaries were successfully generated.")
418
  return None
419
-
420
- # Step 3: Consolidate batch summaries
421
- consolidated_data = consolidate_batch_summaries(batch_summaries, start_date, end_date, statement_type)
422
-
423
- # Step 4: Generate final comprehensive report
424
- final_report = generate_final_report(model, consolidated_data, statement_type)
425
-
426
- return final_report
427
- # Install required libraries:
428
- # pip install fpdf2 beautifulsoup4 markdown
429
-
430
- from bs4 import BeautifulSoup
431
- # For logging errors/info
432
 
 
433
  class PDF_Generator(FPDF):
434
- """
435
- FPDF subclass to potentially add headers/footers later if needed.
436
- Currently just a basic FPDF wrapper.
437
- """
438
- def __init__(self, orientation='P', unit='mm', format='A4'):
439
- super().__init__(orientation, unit, format)
440
- self.set_auto_page_break(auto=True, margin=15) # Enable auto page break
441
- self.set_left_margin(15)
442
- self.set_right_margin(15)
443
- self.alias_nb_pages() # Allows for page numbering {nb}
444
-
445
- # Example: Add a simple footer
446
- # def footer(self):
447
- # self.set_y(-15) # Position 1.5 cm from bottom
448
- # self.set_font('helvetica', 'I', 8)
449
- # self.cell(0, 10, f'Page {self.page_no()}/{{nb}}', 0, 0, 'C')
450
-
451
  def add_html_element(self, tag, styles):
452
- """ Processes a single HTML tag """
453
  text = tag.get_text()
454
  tag_name = tag.name.lower()
455
-
456
- # --- Basic Styling ---
457
  current_style = ''
458
- if 'b' in styles or 'strong' in styles:
459
- current_style += 'B'
460
- if 'i' in styles or 'em' in styles:
461
- current_style += 'I'
462
- # Reset font to default if no style
463
- if not current_style:
464
- self.set_font('helvetica', '', self.font_size_pt) # Reset to regular if needed
465
-
466
- # --- Handle Specific Tags ---
467
  if tag_name in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']:
468
  level = int(tag_name[1])
469
  font_size = {1: 18, 2: 16, 3: 14, 4: 12, 5: 11, 6: 10}.get(level, 10)
470
- self.set_font('helvetica', 'B', font_size) # Headings usually bold
471
- self.multi_cell(0, font_size * 0.5, text, align='L') # Use approx line height
472
- self.ln(font_size * 0.3) # Space after heading
473
- self.set_font('helvetica', '', 10) # Reset font size
474
  elif tag_name == 'p':
475
  self.set_font('helvetica', current_style, 10)
476
- self.multi_cell(0, 5, text, align='L') # 5mm line height
477
- self.ln(3) # Space after paragraph
478
  elif tag_name == 'ul':
479
  self.ln(2)
480
  for item in tag.find_all('li', recursive=False):
481
- self.set_font('helvetica', '', 10) # Reset font for list item text
482
- item_text = item.get_text()
483
- self.cell(5, 5, chr(127)) # Bullet point (using a circle character)
484
- self.multi_cell(0, 5, item_text, align='L') # Remaining width
485
- self.ln(1) # Small space between items
486
- self.ln(3)
487
- elif tag_name == 'ol':
488
- self.ln(2)
489
- for i, item in enumerate(tag.find_all('li', recursive=False), 1):
490
- self.set_font('helvetica', '', 10) # Reset font for list item text
491
  item_text = item.get_text()
492
- self.cell(8, 5, f"{i}.") # Numbered item
493
  self.multi_cell(0, 5, item_text, align='L')
494
  self.ln(1)
495
  self.ln(3)
@@ -497,387 +243,168 @@ class PDF_Generator(FPDF):
497
  self.ln(5)
498
  self.process_table(tag)
499
  self.ln(5)
500
- elif tag_name in ['b', 'strong', 'i', 'em']:
501
- # Handled by style tracking within parent elements for now
502
- # Direct rendering might be needed for nested styles
503
- pass # Style is applied by parent
504
- elif tag_name == 'br':
505
- self.ln(5) # Treat <br> as a line break
506
  elif tag_name == 'hr':
507
  self.ln(2)
508
  self.line(self.get_x(), self.get_y(), self.w - self.r_margin, self.get_y())
509
  self.ln(4)
510
  else:
511
- # Fallback for unknown tags - just print text content
512
- if text.strip(): # Only print if there's actual text
513
  self.set_font('helvetica', current_style, 10)
514
  self.multi_cell(0, 5, text, align='L')
515
  self.ln(1)
516
 
517
  def process_table(self, table_tag):
518
- """ Rudimentary table processing """
519
  rows = table_tag.find_all('tr')
520
- if not rows:
521
- return
522
-
523
- # --- Determine number of columns (use first row) ---
524
  header_cells = rows[0].find_all(['th', 'td'])
525
  num_cols = len(header_cells)
526
- if num_cols == 0:
527
- return
528
-
529
- # --- Calculate column widths (simple equal distribution) ---
530
- # Effective page width = Page width - left margin - right margin
531
  effective_width = self.w - self.l_margin - self.r_margin
532
  col_width = effective_width / num_cols
533
- default_cell_height = 6 # Adjust as needed
534
-
535
- # --- Process Header Row ---
536
  is_first_row = True
537
  for row in rows:
538
  cells = row.find_all(['th', 'td'])
539
- if len(cells) != num_cols:
540
- st.warning(f"Table row has inconsistent number of cells ({len(cells)} vs {num_cols}). Skipping row.")
541
- continue # Skip rows with wrong number of cells
542
-
543
- # Check page break possibility before drawing row
544
- max_cell_h = default_cell_height # Start with default
545
- # Estimate height needed (very basic, doesn't account for actual wrap height)
546
- for cell in cells:
547
- # This is a rough estimate, multi_cell calculates real height
548
- pass # Cannot easily pre-calculate multi_cell height
549
-
550
- # If estimated height exceeds remaining page space, add page
551
- # Note: FPDF's auto page break handles this better with multi_cell
552
- # if self.get_y() + max_cell_h > self.page_break_trigger:
553
- # self.add_page()
554
-
555
  is_header_row = all(c.name == 'th' for c in cells) or (is_first_row and any(c.name == 'th' for c in cells))
556
-
557
  for i, cell in enumerate(cells):
558
  cell_text = cell.get_text().strip()
559
  if is_header_row:
560
- self.set_font('helvetica', 'B', 9) # Bold header text
561
- self.set_fill_color(230, 230, 230) # Light grey fill
562
  fill = True
563
  else:
564
- self.set_font('helvetica', '', 9) # Regular text
565
- self.set_fill_color(255, 255, 255) # No fill (or alternate row color)
566
- fill = False # Or implement zebra striping
567
-
568
- # Use multi_cell for text wrapping. Draw border '1'. Align 'L'.
569
- # multi_cell automatically handles height based on content.
570
- self.multi_cell(col_width, default_cell_height, cell_text, border=1, align='L', fill=fill, ln=3) # ln=3 moves to beginning of next cell
571
-
572
- self.ln(default_cell_height) # Move down after the row is complete (based on default height, multi_cell might make row taller)
573
- is_first_row = False # Header only applies to the first row potentially
574
-
575
 
576
  def create_pdf_report(report_text):
577
- """
578
- Creates a PDF from markdown text locally using FPDF2.
579
-
580
- Args:
581
- report_text (str): Markdown formatted report text.
582
-
583
- Returns:
584
- BytesIO: PDF file in memory buffer.
585
-
586
- Raises:
587
- Exception: If PDF generation fails.
588
- """
589
  if not report_text:
590
- st.warning("Report text is empty, skipping PDF generation.") # Log
591
  raise ValueError("Input report_text cannot be empty.")
592
-
593
  try:
594
- st.info("Starting PDF generation from markdown report...") # Log
595
- # 1. Clean Markdown
596
- cleaned_md = re.sub(r'^```markdown\s*', '', report_text, flags=re.MULTILINE)
597
- cleaned_md = re.sub(r'\s*```$', '', cleaned_md, flags=re.MULTILINE)
598
- cleaned_md = cleaned_md.strip()
599
- # st.debug("Markdown cleaned.") # Too verbose
600
-
601
- # 2. Convert Markdown to HTML
602
- html_content = markdown.markdown(cleaned_md, extensions=['tables', 'fenced_code', 'sane_lists'])
603
- if not html_content:
604
- st.error("Markdown parsing resulted in empty HTML.") # Log
605
- raise ValueError("Markdown parsing resulted in empty HTML.")
606
- # st.debug("Markdown converted to HTML.") # Too verbose
607
-
608
- # 3. Parse HTML with BeautifulSoup
609
  soup = BeautifulSoup(html_content, 'html.parser')
610
- # st.debug("HTML parsed with BeautifulSoup.") # Too verbose
611
-
612
- # 4. Generate PDF using FPDF
613
  pdf = PDF_Generator()
 
 
 
614
  pdf.add_page()
615
- pdf.set_font('helvetica', '', 10) # Default font
616
- st.info("PDF document initialized, adding content...") # Log
617
-
618
- # Iterate through top-level tags in the HTML body
619
  for element in soup.find_all(recursive=False):
620
- styles = set()
621
- def traverse(tag, current_styles):
622
- local_style_added = None
623
- if tag.name in ['b', 'strong']:
624
- current_styles.add('b')
625
- local_style_added = 'b'
626
- elif tag.name in ['i', 'em']:
627
- current_styles.add('i')
628
- local_style_added = 'i'
629
-
630
- if tag.name in ['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'ul', 'ol', 'table', 'br', 'hr']:
631
- pdf.add_html_element(tag, current_styles.copy())
632
- else:
633
- if hasattr(tag, 'contents'):
634
- for child in tag.contents:
635
- if isinstance(child, str):
636
- pass
637
- elif hasattr(child, 'name'):
638
- traverse(child, current_styles.copy())
639
-
640
- if local_style_added and local_style_added in current_styles:
641
- current_styles.remove(local_style_added)
642
-
643
- traverse(element, styles)
644
-
645
- st.info("Content added to PDF. Outputting PDF to buffer...") # Log
646
- # 5. Output PDF to BytesIO buffer
647
- pdf_output = pdf.output(dest='S') # Output as bytes string
648
- if isinstance(pdf_output, str):
649
- # If output is string (older fpdf versions?), encode it
650
- pdf_output = pdf_output.encode('latin-1')
651
-
652
- st.success("PDF report generated successfully.") # Log success
653
  return BytesIO(pdf_output)
654
-
655
- except ImportError:
656
- st.error("FPDF2, BeautifulSoup4 or Markdown library not installed. Please install using: pip install fpdf2 beautifulsoup4 markdown")
657
- raise Exception("Missing required libraries: FPDF2, BeautifulSoup4, Markdown") from None
658
  except Exception as e:
659
- st.error(f"Failed to generate PDF locally using FPDF: {type(e).__name__}: {e}")
660
- st.exception(e) # Show traceback in streamlit logs
661
- raise Exception(f"Local FPDF PDF generation failed: {e}") from e
662
-
663
 
664
  def main():
665
  st.title("Quantitlytix AI")
666
  st.markdown("*Bank Statement Parser & Financial Report Generator*")
667
- st.info("Application started. Ready for user input.") # Log app start
668
 
669
- # Initialize session state for last error
670
- if 'last_error' not in st.session_state:
671
- st.session_state['last_error'] = None
672
-
673
- # Initialize session state for transaction date range
674
  if 'min_date' not in st.session_state:
675
- st.session_state['min_date'] = date(2024, 1, 1) # Default min date
676
  if 'max_date' not in st.session_state:
677
- st.session_state['max_date'] = date(2024, 12, 31) # Default max date
 
 
678
 
679
- # Sidebar: Select input type: Bulk PDF or CSV Upload
680
  input_type = st.sidebar.radio("Select Input Type", ("Bulk Bank Statement Upload", "CSV Upload"))
681
- st.info(f"Input type selected: {input_type}") # Log input type
682
-
683
- all_transactions = []
684
 
685
  if input_type == "Bulk Bank Statement Upload":
686
  uploaded_files = st.file_uploader("Upload PDF bank statements", type="pdf", accept_multiple_files=True)
687
  if uploaded_files:
688
- st.info(f"User uploaded {len(uploaded_files)} PDF file(s).") # Log file upload
689
- total_files = len(uploaded_files)
690
- st.write(f"{total_files} PDF file(s) uploaded.")
691
- try:
692
- model = configure_gemini(api_key)
693
-
694
- # Create a progress bar
695
- progress_bar = st.progress(0)
696
- status_text = st.empty()
697
-
698
- file_progress = 0
699
- for file_index, uploaded_file in enumerate(uploaded_files):
700
- st.info(f"Starting processing for file {file_index+1}/{total_files}: {uploaded_file.name}") # Log individual file start
701
- # Update file progress
702
- file_progress = (file_index) / total_files
703
- progress_bar.progress(overall_progress) # Corrected variable name
704
- status_text.text(f"Processing file {file_index+1} of {total_files}: {uploaded_file.name}")
705
-
706
- # Get PDF reader and page count
707
- try:
708
- pdf_reader, total_pages = read_pdf_pages(uploaded_file)
709
-
710
- if total_pages == 0:
711
- st.warning(f"No pages found in {uploaded_file.name}. Skipping file.") # Log
712
- continue
713
-
714
- with st.spinner(f"Processing {uploaded_file.name} ({total_pages} pages)..."):
715
- # Define progress callback for page-by-page processing
716
- def update_page_progress(page_progress, message):
717
- # Calculate overall progress (file progress + current file's contribution)
718
- overall_progress = file_progress + (page_progress * (1/total_files))
719
- progress_bar.progress(overall_progress)
720
- status_text.text(f"File {file_index+1}/{total_files}: {message}")
721
-
722
- # Process the PDF page by page
723
- st.info(f"Calling process_pdf_pages for {uploaded_file.name}...") # Log
724
- file_transactions = process_pdf_pages(
725
- model,
726
- pdf_reader,
727
- total_pages,
728
- progress_callback=update_page_progress
729
- )
730
-
731
- # Add transactions from this file to overall list
732
- all_transactions.extend(file_transactions)
733
- st.info(f"Finished processing {uploaded_file.name}. Extracted {len(file_transactions)} transactions.") # Log file completion
734
-
735
- except Exception as e:
736
- st.error(f"Error processing {uploaded_file.name}: {str(e)}") # Log specific file error
737
- st.exception(e) # Show traceback
738
- continue
739
-
740
- # Complete the progress bar
741
- progress_bar.progress(1.0)
742
- status_text.text(f"Completed processing {total_files} files!")
743
- st.success(f"All PDF files processed. Total transactions collected: {len(all_transactions)}.") # Log overall completion
744
-
745
- except Exception as e:
746
- st.error(f"Overall error during PDF document processing: {str(e)}") # Log general error during PDF handling
747
- st.error("Please ensure you're using valid bank statement PDFs and a valid API key")
748
- st.exception(e) # Show traceback
749
  elif input_type == "CSV Upload":
750
  uploaded_csv = st.file_uploader("Upload CSV of transactions", type="csv")
751
  if uploaded_csv:
752
- st.info(f"User uploaded CSV file: {uploaded_csv.name}.") # Log
753
- try:
754
- df = pd.read_csv(uploaded_csv)
755
- # Drop 'Unnamed:' columns from the uploaded CSV
756
- df = df.loc[:, ~df.columns.str.startswith('Unnamed:')]
757
- st.write("CSV Data Preview:")
758
- st.dataframe(df.head())
759
- # Convert dataframe to list of transaction dictionaries
760
- transactions = df.to_dict(orient='records')
761
- all_transactions.extend(transactions)
762
- st.success(f"Successfully loaded {len(transactions)} transactions from CSV.") # Log
763
- except Exception as e:
764
- st.error(f"Error processing CSV file: {str(e)}") # Log CSV error
765
- st.exception(e)
766
-
767
- # If transactions are loaded, show DataFrame and update date ranges
768
- if all_transactions:
769
- st.info("Consolidating and displaying all extracted transactions.") # Log
770
- df = pd.DataFrame(all_transactions)
771
- # Drop 'Unnamed:' columns from the final DataFrame
772
- if not df.empty:
773
  df = df.loc[:, ~df.columns.str.startswith('Unnamed:')]
774
- try:
775
- # Process dates and extract min/max dates for date range inputs
776
- st.info("Parsing transaction dates and determining date range.") # Log
777
- df['Date'] = pd.to_datetime(df['Date'], format='%d/%m/%Y', errors='coerce')
778
-
779
- # Get min and max dates from transactions
780
- if not df['Date'].isna().all():
781
- min_date = df['Date'].min().date()
782
- max_date = df['Date'].max().date()
783
-
784
- # Update session state with actual transaction date range
785
- st.session_state['min_date'] = min_date
786
- st.session_state['max_date'] = max_date
787
- st.info(f"Determined transaction date range: {min_date} to {max_date}.") # Log
788
- else:
789
- st.warning("Could not determine valid date range from transactions. Using default dates.") # Log
790
-
791
- # Format dates for display
792
- df['Date'] = df['Date'].dt.strftime('%d/%m/%Y')
793
-
794
- except Exception as e:
795
- st.warning("Some transaction dates could not be formatted correctly.")
796
- st.exception(e)
797
-
798
- st.success("Transactions loaded successfully!")
799
- st.write("### Extracted Transactions")
800
- st.dataframe(df)
801
- else:
802
- st.warning("No valid transactions could be extracted from the documents.")
803
  else:
804
- st.info("No transactions loaded yet. Upload files to begin.") # Initial state log
805
 
806
- # Financial report generation parameters
807
  st.write("### Generate Financial Report")
808
  col1, col2 = st.columns(2)
809
  with col1:
810
- # Use the min_date from transactions if available, otherwise use default
811
  start_date = st.date_input("Start Date", st.session_state['min_date'])
812
  with col2:
813
- # Use the max_date from transactions if available, otherwise use default
814
  end_date = st.date_input("End Date", st.session_state['max_date'])
815
-
816
  statement_type = st.selectbox("Select Financial Statement", ["Income Statement", "Cashflow Statement", "Balance Sheet"])
817
 
818
  if st.button("Generate Financial Report"):
819
- st.info(f"User clicked 'Generate Financial Report' for {statement_type} from {start_date} to {end_date}.")
820
- if not all_transactions:
821
  st.error("No transactions available to generate report. Please upload files first.")
822
  else:
823
- # Filter transactions by date
824
- st.info(f"Filtering {len(all_transactions)} transactions for the period {start_date} to {end_date}...")
825
- filtered_transactions = []
826
- for transaction in all_transactions:
827
- try:
828
- transaction_date = datetime.strptime(transaction.get('Date'), '%d/%m/%Y').date()
829
- if start_date <= transaction_date <= end_date:
830
- filtered_transactions.append(transaction)
831
- except (ValueError, TypeError):
832
- st.warning(f"Could not parse date for transaction, skipping: {transaction}")
833
- continue
834
-
835
- if not filtered_transactions:
836
- st.warning("No transactions found within the selected date range. Please adjust dates or upload relevant files.")
837
  else:
838
- st.info(f"Found {len(filtered_transactions)} transactions within the selected date range.")
 
839
  try:
840
- model1 = configure_gemini1(api_key)
841
-
842
- # Decide whether to use batched or regular processing
843
- if len(filtered_transactions) > 600:
844
- st.info(f"Large dataset detected ({len(filtered_transactions)} transactions). Using batched processing...")
845
- with st.spinner("Generating batched financial report..."):
846
- report_text = generate_batched_financial_report(
847
- model1, filtered_transactions, start_date, end_date, statement_type
848
- )
849
- else:
850
- st.info("Using standard processing for smaller dataset...")
851
- combined_json = {"transactions": filtered_transactions}
852
- with st.spinner("Generating financial report..."):
853
- report_text = generate_financial_report(model1, combined_json, start_date, end_date, statement_type)
854
-
855
- if report_text:
856
- st.success("Financial report generated successfully!")
857
-
858
- # Display the report as markdown
859
- st.markdown("### Financial Report Preview")
860
- st.markdown(report_text)
861
-
862
- # Create PDF from markdown
863
- try:
864
- st.info("Generating PDF from the report...")
865
  pdf_buffer = create_pdf_report(report_text)
866
  st.download_button(
867
  label="Download Financial Report as PDF",
868
- data=pdf_buffer.getvalue(),
869
  file_name=f"{statement_type.replace(' ', '_')}_{datetime.now().strftime('%Y%m%d')}.pdf",
870
  mime="application/pdf"
871
  )
872
- st.success("PDF download ready.")
873
- except Exception as e:
874
- st.error(f"Error generating PDF: {str(e)}")
875
- st.exception(e)
876
-
877
  except Exception as e:
878
- st.error(f"Error generating financial report: {str(e)}")
879
- if "504" in str(e):
880
- st.info("Consider using a smaller date range or fewer transactions.")
881
  st.exception(e)
882
 
883
  if __name__ == "__main__":
 
2
  import json
3
  import os
4
  import time
5
+ from datetime import datetime, date
6
  from io import BytesIO
 
7
  import pandas as pd
8
  import streamlit as st
9
  import google.generativeai as genai
10
  import pypdf
11
  from fpdf import FPDF
 
 
12
  from google.api_core import exceptions
13
+ import markdown
14
+ from bs4 import BeautifulSoup
15
 
16
+ # Configure API key for Gemini - Ensure this is set in your environment variables
17
  api_key = os.getenv('Gemini')
18
 
19
def configure_gemini(api_key):
    """Configure the Gemini client and return the transaction-extraction model.

    Args:
        api_key: Gemini API key (read from the 'Gemini' environment variable
            at module level).

    Returns:
        A GenerativeModel instance used for parsing bank-statement text.

    Raises:
        ValueError: If api_key is missing/empty. genai.configure accepts a
            missing key silently and the failure would otherwise only surface
            as an opaque error on the first generate_content call.
    """
    if not api_key:
        raise ValueError("Gemini API key is not set (expected in the 'Gemini' environment variable).")
    st.info("Configuring Gemini API for transaction extraction...")
    genai.configure(api_key=api_key)
    # Model chosen specifically for structured transaction extraction.
    return genai.GenerativeModel('gemini-2.0-flash-thinking-exp')
27
 
28
def configure_gemini1(api_key):
    """Configure the Gemini client and return the report-generation model.

    Args:
        api_key: Gemini API key (read from the 'Gemini' environment variable
            at module level).

    Returns:
        A GenerativeModel instance used for formatting the financial report.

    Raises:
        ValueError: If api_key is missing/empty, so misconfiguration fails
            fast instead of surfacing later as an opaque API error.
    """
    if not api_key:
        raise ValueError("Gemini API key is not set (expected in the 'Gemini' environment variable).")
    st.info("Configuring Gemini API for report generation...")
    genai.configure(api_key=api_key)
    # Higher-quality model used only for the final formatting pass.
    return genai.GenerativeModel('gemini-2.5-pro')
36
+
 
37
def read_pdf_pages(file_obj):
    """Open an uploaded PDF and report how many pages it contains.

    Args:
        file_obj: A file-like object (e.g. a Streamlit UploadedFile) exposing
            ``name`` and ``seek``.

    Returns:
        Tuple of (pypdf.PdfReader, total page count).
    """
    st.info(f"Reading PDF pages from {file_obj.name}...")
    # Rewind first: the uploaded file's cursor may not be at the start.
    file_obj.seek(0)
    reader = pypdf.PdfReader(file_obj)
    page_count = len(reader.pages)
    st.info(f"Found {page_count} pages in PDF.")
    return reader, page_count
44
 
 
45
def extract_page_text(pdf_reader, page_num):
    """Return the extracted text of one PDF page.

    Args:
        pdf_reader: An open pypdf.PdfReader.
        page_num: Zero-based page index.

    Returns:
        The page text, or "" when the index is out of range or extraction
        yields nothing.
    """
    # Guard clause: out-of-range pages simply yield an empty string.
    if page_num >= len(pdf_reader.pages):
        return ""
    page_text = pdf_reader.pages[page_num].extract_text()
    if not page_text or not page_text.strip():
        st.warning(f"Page {page_num + 1} appears to be empty or contains no extractable text.")
    return page_text or ""
52
 
 
53
  def process_with_gemini(model, text):
54
  prompt = """Analyze this bank statement and extract transactions in JSON format with these fields:
55
  - Date (format DD/MM/YYYY)
 
76
  ]
77
  }"""
78
  try:
 
79
  response = model.generate_content([prompt, text])
80
+ time.sleep(6) # Retaining original sleep time as per user's working code
 
81
  return response.text
82
+ except exceptions.GoogleAPICallError as e:
83
+ st.error(f"A Google API call error occurred during transaction extraction: {e}")
84
+ if "context length" in str(e):
85
+ st.warning("The text on a single PDF page may be too long for the extraction model.")
86
+ return None
 
 
87
  except Exception as e:
88
+ st.error(f"An unexpected error occurred during Gemini transaction extraction: {e}")
89
  return None
90
 
 
91
def process_pdf_pages(model, pdf_reader, total_pages, progress_callback=None):
    """Extract transactions from every page of a PDF, one Gemini call per page.

    Args:
        model: Gemini model from configure_gemini().
        pdf_reader: An open pypdf.PdfReader.
        total_pages: Number of pages to walk.
        progress_callback: Optional callable(progress_fraction, message) used
            by the caller's UI to report progress.

    Returns:
        A list of transaction dicts accumulated across all pages.
    """
    all_transactions = []
    st.info(f"Starting page-by-page PDF processing for {total_pages} pages...")
    for page_num in range(total_pages):
        if progress_callback:
            progress_callback(page_num / total_pages, f"Processing page {page_num + 1} of {total_pages}")
        # Pages with no extractable text (e.g. scanned images) are skipped.
        page_text = extract_page_text(pdf_reader, page_num)
        if not page_text.strip():
            continue
        st.info(f"Sending page {page_num + 1} text to Gemini for transaction extraction...")
        json_response = process_with_gemini(model, page_text)
        if not json_response:
            st.warning(f"Gemini returned no response for page {page_num + 1}.")
            continue
        # The model may wrap the JSON in prose; grab the outermost {...} span.
        match = re.search(r'\{.*\}', json_response, re.DOTALL)
        if match is None:
            st.warning(f"No valid JSON object found in Gemini response for page {page_num + 1}.")
            continue
        try:
            payload = json.loads(match.group(0))
        except json.JSONDecodeError:
            st.error(f"Failed to decode JSON from Gemini response for page {page_num + 1}.")
            continue
        transactions = payload.get('transactions', [])
        if transactions:
            st.info(f"Successfully extracted {len(transactions)} transactions from page {page_num + 1}.")
            all_transactions.extend(transactions)
    st.info(f"Finished processing all pages. Total transactions extracted: {len(all_transactions)}.")
    return all_transactions
128
 
129
def aggregate_financial_data(transactions: list, statement_type: str):
    """Aggregate raw transaction dicts into a compact financial summary.

    The heavy lifting (sums, groupings) is done locally with pandas so that
    only a small JSON summary — not the full transaction list — is sent to
    the LLM for formatting.

    Args:
        transactions: List of transaction dicts; expected keys include
            'Amount' and 'Type' ('income'/'expense', case-insensitive), and
            optionally 'Category_of_expense' / 'Customer_name'.
        statement_type: One of "Income Statement", "Cashflow Statement",
            "Balance Sheet"; controls which extra aggregates are included.

    Returns:
        A dict of aggregate figures, or None when there is nothing usable
        to aggregate (empty input or required columns missing).
    """
    st.info(f"Performing local financial aggregation for {len(transactions)} transactions...")
    if not transactions:
        st.warning("No transactions to aggregate.")
        return None

    df = pd.DataFrame(transactions)

    # Robustness fix: CSV uploads are arbitrary, so a missing 'Amount' or
    # 'Type' column previously crashed with a KeyError. Bail out gracefully
    # instead; callers already treat None as "nothing to report".
    missing = [col for col in ('Amount', 'Type') if col not in df.columns]
    if missing:
        st.warning(f"Cannot aggregate: missing required column(s) {missing}.")
        return None

    # --- Data Cleaning and Preparation ---
    df['Amount'] = pd.to_numeric(df['Amount'], errors='coerce').fillna(0)
    df['Type'] = df['Type'].astype(str).str.lower()

    # --- Core Financial Calculations ---
    # Cast to plain float so the summary is always JSON-serializable.
    total_income = float(df[df['Type'] == 'income']['Amount'].sum())
    total_expenses = float(df[df['Type'] == 'expense']['Amount'].sum())
    net_position = total_income - total_expenses

    # --- Build the Aggregated Data Structure ---
    aggregated_data = {
        "total_income": total_income,
        "total_expenses": total_expenses,
        "net_position": net_position,
        "transaction_count": len(df)
    }

    # --- Statement-Specific Aggregations ---
    if statement_type == "Income Statement":
        # Group only when the optional column exists; otherwise report an
        # empty breakdown rather than crashing with a KeyError.
        if 'Category_of_expense' in df.columns:
            aggregated_data["expense_breakdown"] = (
                df[df['Type'] == 'expense'].groupby('Category_of_expense')['Amount'].sum().round(2).to_dict()
            )
        else:
            aggregated_data["expense_breakdown"] = {}
        if 'Customer_name' in df.columns:
            aggregated_data["income_breakdown"] = (
                df[df['Type'] == 'income'].groupby('Customer_name')['Amount'].sum().round(2).to_dict()
            )
        else:
            aggregated_data["income_breakdown"] = {}
    elif statement_type == "Cashflow Statement":
        # With transaction flows only, operating cash flow equals the net
        # income/expense position over the period.
        aggregated_data["operating_cash_flow"] = net_position
        aggregated_data["cash_inflows"] = total_income
        aggregated_data["cash_outflows"] = total_expenses
    elif statement_type == "Balance Sheet":
        aggregated_data["notes"] = "Balance Sheets require asset and liability balances, not just transaction flows. This data can only show the net change in cash over the period."

    st.success("Local financial aggregation complete.")
    return aggregated_data
173
+
174
def generate_financial_report(model, aggregated_data, start_date, end_date, statement_type):
    """
    Generates a financial report by sending a small, pre-aggregated summary to the LLM.
    The LLM's job is to format this data professionally, not to calculate it.

    Args:
        model: Configured Gemini GenerativeModel used for formatting.
        aggregated_data: Summary dict produced by aggregate_financial_data().
        start_date: Period start (a date; formatted DD/MM/YYYY in the prompt).
        end_date: Period end (a date; formatted DD/MM/YYYY in the prompt).
        statement_type: "Income Statement", "Cashflow Statement" or "Balance Sheet".

    Returns:
        The Markdown report text, or None when the Gemini call fails.
    """
    st.info(f"Preparing to generate {statement_type} with pre-aggregated data...")
    # The prompt embeds the (small) aggregated summary directly; the model is
    # asked only to format it, never to compute figures itself.
    prompt = f"""
Based on the following pre-aggregated financial summary JSON data:
{json.dumps(aggregated_data, indent=2)}

Generate a detailed {statement_type} report for the period from {start_date.strftime('%d/%m/%Y')} to {end_date.strftime('%d/%m/%Y')}. Present the report in a standard accounting format relevant to South Africa, but with improved readability and visual appeal.

Specific Formatting and Content Requirements:

Standard Accounting Structure (South Africa Focus): Organize the {statement_type} according to typical accounting practices followed in South Africa (e.g., for an Income Statement, clearly separate Revenue, Cost of Goods Sold, Gross Profit, Operating Expenses, and Net Income, in nice tables considering local terminology where applicable). If unsure of specific local variations, adhere to widely accepted international accounting structures.
Clear Headings and Subheadings: Use distinct and informative headings and subheadings in English to delineate different sections of the report. Ensure these are visually prominent.
Consistent Formatting: Maintain consistent formatting for monetary values (using "R" for South African Rand), dates, and alignment.
Totals and Subtotals: Clearly display totals for relevant categories and subtotals where appropriate.
Descriptive Line Items: Use the provided aggregated data to create clear line items.
Key Insights: Include a brief section (e.g., "Key Highlights" or "Summary") that identifies significant trends or key performance indicators derived from the provided summary data.
Concise Summary: Provide a concluding summary paragraph that encapsulates the overall financial picture.
Special Case for Balance Sheet: If the request is for a "Balance Sheet," explain professionally that a balance sheet cannot be generated from transaction data alone, as it requires a snapshot of assets, liabilities, and equity. Then, present the available cash flow information as a helpful alternative.

Format the entire report in Markdown for better visual structure.
Do not name the company if a name is not there; refer to it as "The Business". Return just the report and nothing else.
"""
    try:
        st.info("Sending request to Gemini for final report formatting...")
        response = model.generate_content([prompt])
        # NOTE(review): retained from the original flow — presumably crude
        # rate limiting between API calls; confirm before removing.
        time.sleep(7)  # Retaining original sleep time
        st.success("Successfully received formatted financial report from Gemini.")
        return response.text
    except exceptions.GoogleAPICallError as e:
        # API-level failures (quota, timeouts, etc.) are surfaced to the UI;
        # the caller treats None as "no report produced".
        st.error(f"A Google API call error occurred during report generation: {e}")
        return None
    except Exception as e:
        st.error(f"An unexpected error occurred during Gemini report generation: {e}")
        return None
 
 
 
 
 
 
 
 
 
 
 
 
 
212
 
213
+ # --- PDF Generation Logic (Unaltered as per your request) ---
214
  class PDF_Generator(FPDF):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
215
  def add_html_element(self, tag, styles):
 
216
  text = tag.get_text()
217
  tag_name = tag.name.lower()
 
 
218
  current_style = ''
219
+ if 'b' in styles or 'strong' in styles: current_style += 'B'
220
+ if 'i' in styles or 'em' in styles: current_style += 'I'
221
+ if not current_style: self.set_font('helvetica', '', self.font_size_pt)
 
 
 
 
 
 
222
  if tag_name in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']:
223
  level = int(tag_name[1])
224
  font_size = {1: 18, 2: 16, 3: 14, 4: 12, 5: 11, 6: 10}.get(level, 10)
225
+ self.set_font('helvetica', 'B', font_size)
226
+ self.multi_cell(0, font_size * 0.5, text, align='L')
227
+ self.ln(font_size * 0.3)
228
+ self.set_font('helvetica', '', 10)
229
  elif tag_name == 'p':
230
  self.set_font('helvetica', current_style, 10)
231
+ self.multi_cell(0, 5, text, align='L')
232
+ self.ln(3)
233
  elif tag_name == 'ul':
234
  self.ln(2)
235
  for item in tag.find_all('li', recursive=False):
236
+ self.set_font('helvetica', '', 10)
 
 
 
 
 
 
 
 
 
237
  item_text = item.get_text()
238
+ self.cell(5, 5, chr(127))
239
  self.multi_cell(0, 5, item_text, align='L')
240
  self.ln(1)
241
  self.ln(3)
 
243
  self.ln(5)
244
  self.process_table(tag)
245
  self.ln(5)
246
+ elif tag_name == 'br': self.ln(5)
 
 
 
 
 
247
  elif tag_name == 'hr':
248
  self.ln(2)
249
  self.line(self.get_x(), self.get_y(), self.w - self.r_margin, self.get_y())
250
  self.ln(4)
251
  else:
252
+ if text.strip():
 
253
  self.set_font('helvetica', current_style, 10)
254
  self.multi_cell(0, 5, text, align='L')
255
  self.ln(1)
256
 
257
    def process_table(self, table_tag):
        """Render an HTML <table> element as a bordered FPDF grid.

        Column count is taken from the first row; columns share the printable
        width equally. Rows whose cell count differs from the header are
        skipped. NOTE(review): each cell is drawn with a fixed-height
        multi_cell, so text that wraps to multiple lines may overflow its
        row — confirm acceptable for expected report tables.
        """
        rows = table_tag.find_all('tr')
        if not rows: return
        # The first row fixes the column count for the whole table.
        header_cells = rows[0].find_all(['th', 'td'])
        num_cols = len(header_cells)
        if num_cols == 0: return
        # Split the printable width (page minus margins) evenly per column.
        effective_width = self.w - self.l_margin - self.r_margin
        col_width = effective_width / num_cols
        default_cell_height = 6
        is_first_row = True
        for row in rows:
            cells = row.find_all(['th', 'td'])
            # Skip malformed rows that don't match the header's column count.
            if len(cells) != num_cols: continue
            # Header styling when the row is all <th>, or when the first row
            # contains any <th> cell.
            is_header_row = all(c.name == 'th' for c in cells) or (is_first_row and any(c.name == 'th' for c in cells))
            for i, cell in enumerate(cells):
                cell_text = cell.get_text().strip()
                if is_header_row:
                    self.set_font('helvetica', 'B', 9)
                    self.set_fill_color(230, 230, 230)  # light grey header background
                    fill = True
                else:
                    self.set_font('helvetica', '', 9)
                    fill = False
                # new_x="RIGHT"/new_y="TOP" keeps the cursor on the same row
                # so cells are laid out side by side (fpdf2 keywords).
                self.multi_cell(col_width, default_cell_height, cell_text, border=1, align='L', fill=fill, new_x="RIGHT", new_y="TOP")
            # Advance to the next table row.
            self.ln(default_cell_height)
            is_first_row = False
 
 
 
 
 
 
283
 
284
def create_pdf_report(report_text):
    """Render the Markdown report into a PDF and return it as a BytesIO buffer.

    Pipeline: strip code fences -> Markdown -> HTML (with tables) ->
    BeautifulSoup walk -> PDF_Generator.

    Args:
        report_text: Markdown report produced by generate_financial_report().

    Returns:
        BytesIO containing the finished PDF bytes.

    Raises:
        ValueError: If report_text is empty.
        Exception: Any rendering failure is logged to the UI and re-raised.
    """
    if not report_text:
        st.warning("Report text is empty, skipping PDF generation.")
        raise ValueError("Input report_text cannot be empty.")
    try:
        st.info("Starting PDF generation from markdown report...")
        # Strip any ```markdown fences the model may have wrapped the report in.
        cleaned_md = re.sub(r'```markdown|```', '', report_text, flags=re.MULTILINE).strip()
        html_content = markdown.markdown(cleaned_md, extensions=['tables'])
        soup = BeautifulSoup(html_content, 'html.parser')
        pdf = PDF_Generator()
        pdf.set_auto_page_break(auto=True, margin=15)
        pdf.set_left_margin(15)
        pdf.set_right_margin(15)
        pdf.add_page()
        pdf.set_font('helvetica', '', 10)
        # Walk only top-level HTML elements; nested tags are handled inside
        # PDF_Generator.add_html_element.
        for element in soup.find_all(recursive=False):
            pdf.add_html_element(element, set())
        st.info("Content added to PDF. Outputting PDF to buffer...")
        # Bug fix: fpdf2's output() returns a bytearray, which has no .encode()
        # method — the previous `pdf.output(dest='S').encode('latin-1')` raised
        # AttributeError on every call (the file relies on fpdf2 elsewhere,
        # e.g. the new_x/new_y multi_cell keywords). Convert to immutable bytes.
        pdf_output = bytes(pdf.output())
        st.success("PDF report generated successfully.")
        return BytesIO(pdf_output)
    except Exception as e:
        st.error(f"Failed to generate PDF: {e}")
        st.exception(e)
        raise
 
309
 
310
def main():
    """Streamlit entry point: upload bank data, preview it, generate reports.

    Flow: (1) initialise session state, (2) ingest transactions from bulk PDF
    statements (via Gemini extraction) or a CSV, (3) preview the consolidated
    transactions and derive the date range, (4) on demand, aggregate the
    selected period locally and have Gemini format the chosen statement,
    offering the result as a PDF download.
    """
    st.title("Quantitlytix AI")
    st.markdown("*Bank Statement Parser & Financial Report Generator*")

    # Session-state defaults survive Streamlit reruns; the date bounds are
    # overwritten below once real transactions are loaded.
    if 'min_date' not in st.session_state:
        st.session_state['min_date'] = date(2024, 1, 1)
    if 'max_date' not in st.session_state:
        st.session_state['max_date'] = date.today()
    if 'transactions' not in st.session_state:
        st.session_state['transactions'] = []

    input_type = st.sidebar.radio("Select Input Type", ("Bulk Bank Statement Upload", "CSV Upload"))

    if input_type == "Bulk Bank Statement Upload":
        uploaded_files = st.file_uploader("Upload PDF bank statements", type="pdf", accept_multiple_files=True)
        if uploaded_files:
            st.info(f"User uploaded {len(uploaded_files)} PDF file(s).")
            model = configure_gemini(api_key)
            progress_bar = st.progress(0)
            all_transactions = []
            # One pass per file; progress advances per completed file.
            for i, file in enumerate(uploaded_files):
                st.text(f"Processing {file.name}...")
                pdf_reader, total_pages = read_pdf_pages(file)
                if total_pages > 0:
                    file_transactions = process_pdf_pages(model, pdf_reader, total_pages)
                    all_transactions.extend(file_transactions)
                progress_bar.progress((i + 1) / len(uploaded_files))
            # NOTE: replaces (not appends to) any previously loaded transactions.
            st.session_state['transactions'] = all_transactions
            st.success(f"All PDF files processed. Total transactions collected: {len(st.session_state['transactions'])}.")

    elif input_type == "CSV Upload":
        uploaded_csv = st.file_uploader("Upload CSV of transactions", type="csv")
        if uploaded_csv:
            st.info(f"User uploaded CSV file: {uploaded_csv.name}.")
            df = pd.read_csv(uploaded_csv)
            # Drop pandas' auto-generated index columns from re-exported CSVs.
            df = df.loc[:, ~df.columns.str.startswith('Unnamed:')]
            st.session_state['transactions'] = df.to_dict(orient='records')
            st.success(f"Successfully loaded {len(st.session_state['transactions'])} transactions from CSV.")

    if st.session_state['transactions']:
        st.info("Consolidating and displaying all extracted transactions.")
        df = pd.DataFrame(st.session_state['transactions'])
        # dayfirst matches the DD/MM/YYYY format requested from the extractor;
        # unparseable dates become NaT and are dropped from the preview.
        # NOTE(review): assumes a 'Date' column exists — a CSV without one
        # raises KeyError here; confirm whether that needs guarding.
        df['Date'] = pd.to_datetime(df['Date'], errors='coerce', dayfirst=True)
        df.dropna(subset=['Date'], inplace=True)
        if not df.empty:
            # Default the report date pickers to the data's actual span.
            min_date = df['Date'].min().date()
            max_date = df['Date'].max().date()
            st.session_state['min_date'] = min_date
            st.session_state['max_date'] = max_date
        st.write("### Extracted Transactions")
        # astype(str) keeps mixed-type columns renderable in the data grid.
        st.dataframe(df.astype(str))
    else:
        st.info("No transactions loaded yet. Upload files to begin.")

    st.write("### Generate Financial Report")
    col1, col2 = st.columns(2)
    with col1:
        start_date = st.date_input("Start Date", st.session_state['min_date'])
    with col2:
        end_date = st.date_input("End Date", st.session_state['max_date'])
    statement_type = st.selectbox("Select Financial Statement", ["Income Statement", "Cashflow Statement", "Balance Sheet"])

    if st.button("Generate Financial Report"):
        if not st.session_state['transactions']:
            st.error("No transactions available to generate report. Please upload files first.")
        else:
            # Re-parse dates (session state stores plain dicts) and keep only
            # transactions inside the inclusive [start_date, end_date] window.
            df = pd.DataFrame(st.session_state['transactions'])
            df['Date'] = pd.to_datetime(df['Date'], errors='coerce', dayfirst=True)
            mask = (df['Date'] >= pd.to_datetime(start_date)) & (df['Date'] <= pd.to_datetime(end_date))
            filtered_df = df.loc[mask]

            if filtered_df.empty:
                st.warning("No transactions found within the selected date range.")
            else:
                st.info(f"Found {len(filtered_df)} transactions within the selected date range.")
                filtered_transactions_list = filtered_df.to_dict(orient='records')
                try:
                    # Aggregate locally first so only a small summary — not the
                    # full transaction list — is sent to the LLM.
                    with st.spinner("Aggregating financial data locally..."):
                        aggregated_summary = aggregate_financial_data(filtered_transactions_list, statement_type)
                    if aggregated_summary:
                        with st.spinner("Generating formatted report with Gemini..."):
                            model1 = configure_gemini1(api_key)
                            report_text = generate_financial_report(model1, aggregated_summary, start_date, end_date, statement_type)
                        if report_text:
                            st.success("Financial report generated successfully!")
                            st.markdown("### Financial Report Preview")
                            st.markdown(report_text, unsafe_allow_html=True)
                            pdf_buffer = create_pdf_report(report_text)
                            st.download_button(
                                label="Download Financial Report as PDF",
                                data=pdf_buffer,
                                file_name=f"{statement_type.replace(' ', '_')}_{datetime.now().strftime('%Y%m%d')}.pdf",
                                mime="application/pdf"
                            )
                        else:
                            st.error("Failed to generate the financial report from the aggregated data.")
                except Exception as e:
                    # Catch-all boundary: surface the error in the UI with a
                    # traceback instead of crashing the app.
                    st.error(f"An unexpected error occurred during the report generation process: {e}")
                    st.exception(e)
409
 
410
  if __name__ == "__main__":