Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -2,52 +2,54 @@ import re
|
|
| 2 |
import json
|
| 3 |
import os
|
| 4 |
import time
|
| 5 |
-
from datetime import datetime, date
|
| 6 |
from io import BytesIO
|
| 7 |
-
import requests
|
| 8 |
import pandas as pd
|
| 9 |
import streamlit as st
|
| 10 |
import google.generativeai as genai
|
| 11 |
import pypdf
|
| 12 |
from fpdf import FPDF
|
| 13 |
-
from fpdf.enums import XPos, YPos
|
| 14 |
-
import markdown
|
| 15 |
from google.api_core import exceptions
|
| 16 |
-
|
|
|
|
| 17 |
|
| 18 |
-
# Configure API key for Gemini
|
| 19 |
api_key = os.getenv('Gemini')
|
| 20 |
|
| 21 |
def configure_gemini(api_key):
|
| 22 |
-
|
|
|
|
|
|
|
|
|
|
| 23 |
genai.configure(api_key=api_key)
|
|
|
|
| 24 |
return genai.GenerativeModel('gemini-2.0-flash-thinking-exp')
|
| 25 |
|
| 26 |
def configure_gemini1(api_key):
|
| 27 |
-
|
|
|
|
|
|
|
|
|
|
| 28 |
genai.configure(api_key=api_key)
|
|
|
|
| 29 |
return genai.GenerativeModel('gemini-2.5-pro')
|
| 30 |
-
|
| 31 |
-
# Read PDF content page by page from a file-like object
|
| 32 |
def read_pdf_pages(file_obj):
|
| 33 |
-
st.info(f"Reading PDF pages from {file_obj.name}...")
|
| 34 |
-
file_obj.seek(0)
|
| 35 |
pdf_reader = pypdf.PdfReader(file_obj)
|
| 36 |
total_pages = len(pdf_reader.pages)
|
| 37 |
-
st.info(f"Found {total_pages} pages in PDF.")
|
| 38 |
return pdf_reader, total_pages
|
| 39 |
|
| 40 |
-
# Extract text from a specific page
|
| 41 |
def extract_page_text(pdf_reader, page_num):
|
| 42 |
-
# st.debug(f"Extracting text from page {page_num + 1}...") # Too verbose for general logging
|
| 43 |
if page_num < len(pdf_reader.pages):
|
| 44 |
text = pdf_reader.pages[page_num].extract_text()
|
| 45 |
-
if not text.strip():
|
| 46 |
-
st.warning(f"Page {page_num + 1} appears to be empty or contains no extractable text.")
|
| 47 |
return text if text else ""
|
| 48 |
return ""
|
| 49 |
|
| 50 |
-
# Process a chunk of PDF text with Gemini to extract transactions as JSON
|
| 51 |
def process_with_gemini(model, text):
|
| 52 |
prompt = """Analyze this bank statement and extract transactions in JSON format with these fields:
|
| 53 |
- Date (format DD/MM/YYYY)
|
|
@@ -74,422 +76,166 @@ def process_with_gemini(model, text):
|
|
| 74 |
]
|
| 75 |
}"""
|
| 76 |
try:
|
| 77 |
-
# st.debug("Sending text chunk to Gemini for transaction extraction...") # Too verbose
|
| 78 |
response = model.generate_content([prompt, text])
|
| 79 |
-
time.sleep(6)
|
| 80 |
-
# st.debug("Received response from Gemini for transaction extraction.") # Too verbose
|
| 81 |
return response.text
|
| 82 |
-
except exceptions.
|
| 83 |
-
|
| 84 |
-
|
| 85 |
-
|
| 86 |
-
|
| 87 |
-
st.error(f"Gemini API error during transaction extraction: {e}") # Log other API errors
|
| 88 |
-
raise
|
| 89 |
except Exception as e:
|
| 90 |
-
st.error(f"An unexpected error occurred during Gemini transaction extraction: {e}")
|
| 91 |
return None
|
| 92 |
|
| 93 |
-
# Process PDF page by page to handle large files
|
| 94 |
def process_pdf_pages(model, pdf_reader, total_pages, progress_callback=None):
|
| 95 |
all_transactions = []
|
| 96 |
-
st.info(f"Starting page-by-page PDF processing for {total_pages} pages...")
|
| 97 |
-
|
| 98 |
-
# Process pages individually or in small chunks
|
| 99 |
for page_num in range(total_pages):
|
| 100 |
-
# Update progress if callback provided
|
| 101 |
if progress_callback:
|
| 102 |
progress_callback(page_num / total_pages, f"Processing page {page_num + 1} of {total_pages}")
|
| 103 |
-
|
| 104 |
-
# Extract text from current page
|
| 105 |
page_text = extract_page_text(pdf_reader, page_num)
|
| 106 |
-
|
| 107 |
if not page_text.strip():
|
| 108 |
-
|
| 109 |
-
|
| 110 |
-
|
| 111 |
-
# Process the page with Gemini
|
| 112 |
-
st.info(f"Sending page {page_num + 1} text to Gemini for transaction extraction...") # Log
|
| 113 |
json_response = process_with_gemini(model, page_text)
|
| 114 |
-
|
| 115 |
if json_response:
|
| 116 |
-
#
|
| 117 |
-
|
| 118 |
-
|
| 119 |
-
|
| 120 |
-
|
| 121 |
-
|
| 122 |
-
|
| 123 |
-
|
| 124 |
-
json_str = json_response[start_idx:end_idx]
|
| 125 |
-
json_str = json_str.replace('```json', '').replace('```', '')
|
| 126 |
-
|
| 127 |
try:
|
| 128 |
data = json.loads(json_str)
|
| 129 |
transactions = data.get('transactions', [])
|
| 130 |
if transactions:
|
| 131 |
-
st.info(f"Successfully extracted {len(transactions)} transactions from page {page_num + 1}.")
|
| 132 |
all_transactions.extend(transactions)
|
| 133 |
-
else:
|
| 134 |
-
st.info(f"No transactions found on page {page_num + 1} based on Gemini's analysis.") # Log no transactions found on page
|
| 135 |
except json.JSONDecodeError:
|
| 136 |
-
st.error(f"Failed to decode JSON from Gemini response for page {page_num + 1}.
|
| 137 |
-
continue
|
| 138 |
else:
|
| 139 |
-
st.warning(f"Gemini returned no response for page {page_num + 1}.
|
| 140 |
|
| 141 |
-
st.info(f"Finished processing all pages. Total transactions extracted: {len(all_transactions)}.")
|
| 142 |
return all_transactions
|
| 143 |
|
| 144 |
-
|
| 145 |
-
|
| 146 |
-
|
| 147 |
-
|
| 148 |
-
|
| 149 |
-
|
| 150 |
-
|
| 151 |
-
|
| 152 |
-
|
| 153 |
-
Standard Accounting Structure (South Africa Focus): Organize the {statement_type} according to typical accounting practices followed in South Africa (e.g., for an Income Statement, clearly separate Revenue, Cost of Goods Sold, Gross Profit, Operating Expenses, and Net Income, in nice tables considering local terminology where applicable). If unsure of specific local variations, adhere to widely accepted international accounting structures.
|
| 154 |
-
Clear Headings and Subheadings: Use distinct and informative headings and subheadings in English to delineate different sections of the report. Ensure these are visually prominent.
|
| 155 |
-
Consistent Formatting: Maintain consistent formatting for monetary values (e.g., using "R" for South African Rand if applicable and discernible from the data, comma separators for thousands), dates, and alignment.
|
| 156 |
-
Totals and Subtotals: Clearly display totals for relevant categories and subtotals where appropriate to provide a clear understanding of the financial performance or position.
|
| 157 |
-
Descriptive Line Items: Use clear and concise descriptions for each transaction or aggregated account based on the provided JSON data.
|
| 158 |
-
Key Insights: Include a brief section (e.g., "Key Highlights" or "Summary") that identifies significant trends, notable figures, or key performance indicators derived from the data within the statement. This should be written in plain, understandable English, potentially highlighting aspects particularly relevant to the economic context of Zimbabwe if discernible from the data.
|
| 159 |
-
Concise Summary: Provide a concluding summary paragraph that encapsulates the overall financial picture presented in the {statement_type}.
|
| 160 |
-
|
| 161 |
-
Format the report in Markdown for better visual structure.
|
| 162 |
-
Do not name the company if name is not there and return just the report and nothing else."""
|
| 163 |
-
try:
|
| 164 |
-
st.info("Sending request to Gemini for financial report generation...") # Log
|
| 165 |
-
response = model.generate_content([prompt])
|
| 166 |
-
time.sleep(7) # Sleep for 7 seconds to work around rate limit
|
| 167 |
-
st.success("Successfully received financial report from Gemini.") # Log success
|
| 168 |
-
return response.text
|
| 169 |
-
except exceptions.ServiceUnavailable as e:
|
| 170 |
-
if e.response.status_code == 504:
|
| 171 |
-
st.error("Error generating report: Gemini API timed out (504). Please try reducing the time period for the report.")
|
| 172 |
-
st.session_state['last_error'] = "504" # Store the error in session state
|
| 173 |
-
return None
|
| 174 |
-
else:
|
| 175 |
-
st.error(f"Gemini API error during report generation: {e}") # Log other API errors
|
| 176 |
-
raise
|
| 177 |
-
except Exception as e:
|
| 178 |
-
st.error(f"An unexpected error occurred during Gemini report generation: {e}") # Catch other potential errors
|
| 179 |
return None
|
| 180 |
|
| 181 |
-
|
| 182 |
-
"""Split transactions into smaller batches for processing."""
|
| 183 |
-
batches = []
|
| 184 |
-
for i in range(0, len(transactions), batch_size):
|
| 185 |
-
batch = transactions[i:i + batch_size]
|
| 186 |
-
batches.append(batch)
|
| 187 |
-
st.info(f"Split {len(transactions)} transactions into {len(batches)} batches of up to {batch_size} transactions each.")
|
| 188 |
-
return batches
|
| 189 |
-
|
| 190 |
-
def generate_batch_summary(model, json_data, start_date, end_date, statement_type, batch_num, total_batches):
|
| 191 |
-
"""Generate a summary analysis for a batch of transactions."""
|
| 192 |
-
st.info(f"Processing batch {batch_num}/{total_batches} with {len(json_data['transactions'])} transactions...")
|
| 193 |
-
|
| 194 |
-
prompt = f"""Analyze this batch of transactions (batch {batch_num} of {total_batches}) for the period from {start_date.strftime('%d/%m/%Y')} to {end_date.strftime('%d/%m/%Y')}.
|
| 195 |
-
|
| 196 |
-
Transaction data:
|
| 197 |
-
{json.dumps(json_data)}
|
| 198 |
-
|
| 199 |
-
Create a structured summary focusing on aggregation and categorization. Return ONLY the following JSON structure:
|
| 200 |
-
|
| 201 |
-
{{
|
| 202 |
-
"batch_info": {{
|
| 203 |
-
"batch_number": {batch_num},
|
| 204 |
-
"total_batches": {total_batches},
|
| 205 |
-
"transaction_count": {len(json_data['transactions'])},
|
| 206 |
-
"date_range": "{start_date.strftime('%d/%m/%Y')} to {end_date.strftime('%d/%m/%Y')}"
|
| 207 |
-
}},
|
| 208 |
-
"financial_summary": {{
|
| 209 |
-
"total_income": 0,
|
| 210 |
-
"total_expenses": 0,
|
| 211 |
-
"net_position": 0
|
| 212 |
-
}},
|
| 213 |
-
"income_breakdown": {{
|
| 214 |
-
"by_customer": {{}},
|
| 215 |
-
"by_month": {{}}
|
| 216 |
-
}},
|
| 217 |
-
"expense_breakdown": {{
|
| 218 |
-
"by_category": {{}},
|
| 219 |
-
"by_month": {{}}
|
| 220 |
-
}},
|
| 221 |
-
"key_transactions": [
|
| 222 |
-
// Top 5 largest transactions (income and expense)
|
| 223 |
-
],
|
| 224 |
-
"monthly_totals": {{
|
| 225 |
-
// Format: "YYYY-MM": {{"income": 0, "expenses": 0, "net": 0}}
|
| 226 |
-
}}
|
| 227 |
-
}}
|
| 228 |
-
|
| 229 |
-
Focus on numerical aggregation and categorization. Be precise with calculations."""
|
| 230 |
|
| 231 |
-
|
| 232 |
-
|
| 233 |
-
|
| 234 |
-
|
| 235 |
-
|
| 236 |
-
|
| 237 |
-
|
| 238 |
-
|
| 239 |
-
else:
|
| 240 |
-
st.error(f"API error processing batch {batch_num}: {e}")
|
| 241 |
-
raise
|
| 242 |
-
except Exception as e:
|
| 243 |
-
st.error(f"Error processing batch {batch_num}: {e}")
|
| 244 |
-
return None
|
| 245 |
|
| 246 |
-
|
| 247 |
-
|
| 248 |
-
|
| 249 |
-
|
| 250 |
-
|
| 251 |
-
"
|
| 252 |
-
"total_transactions": 0,
|
| 253 |
-
"date_range": f"{start_date.strftime('%d/%m/%Y')} to {end_date.strftime('%d/%m/%Y')}",
|
| 254 |
-
"financial_summary": {
|
| 255 |
-
"total_income": 0,
|
| 256 |
-
"total_expenses": 0,
|
| 257 |
-
"net_position": 0
|
| 258 |
-
},
|
| 259 |
-
"income_breakdown": {
|
| 260 |
-
"by_customer": {},
|
| 261 |
-
"by_month": {}
|
| 262 |
-
},
|
| 263 |
-
"expense_breakdown": {
|
| 264 |
-
"by_category": {},
|
| 265 |
-
"by_month": {}
|
| 266 |
-
},
|
| 267 |
-
"key_transactions": [],
|
| 268 |
-
"monthly_totals": {}
|
| 269 |
}
|
| 270 |
-
|
| 271 |
-
# Process each batch summary
|
| 272 |
-
for batch_data in batch_summaries:
|
| 273 |
-
if not batch_data:
|
| 274 |
-
continue
|
| 275 |
-
|
| 276 |
-
try:
|
| 277 |
-
# Extract JSON from response if needed
|
| 278 |
-
if isinstance(batch_data, str):
|
| 279 |
-
start_idx = batch_data.find('{')
|
| 280 |
-
end_idx = batch_data.rfind('}') + 1
|
| 281 |
-
if start_idx != -1 and end_idx > start_idx:
|
| 282 |
-
json_str = batch_data[start_idx:end_idx]
|
| 283 |
-
batch_data = json.loads(json_str)
|
| 284 |
-
else:
|
| 285 |
-
st.warning("Could not extract JSON from batch summary")
|
| 286 |
-
continue
|
| 287 |
-
|
| 288 |
-
# Aggregate financial summary
|
| 289 |
-
if 'financial_summary' in batch_data:
|
| 290 |
-
fs = batch_data['financial_summary']
|
| 291 |
-
consolidated['financial_summary']['total_income'] += fs.get('total_income', 0)
|
| 292 |
-
consolidated['financial_summary']['total_expenses'] += fs.get('total_expenses', 0)
|
| 293 |
-
|
| 294 |
-
# Aggregate transaction count
|
| 295 |
-
if 'batch_info' in batch_data:
|
| 296 |
-
consolidated['total_transactions'] += batch_data['batch_info'].get('transaction_count', 0)
|
| 297 |
-
|
| 298 |
-
# Merge income breakdown by customer
|
| 299 |
-
if 'income_breakdown' in batch_data:
|
| 300 |
-
for customer, amount in batch_data['income_breakdown'].get('by_customer', {}).items():
|
| 301 |
-
consolidated['income_breakdown']['by_customer'][customer] = \
|
| 302 |
-
consolidated['income_breakdown']['by_customer'].get(customer, 0) + amount
|
| 303 |
-
|
| 304 |
-
# Merge income by month
|
| 305 |
-
for month, amount in batch_data['income_breakdown'].get('by_month', {}).items():
|
| 306 |
-
consolidated['income_breakdown']['by_month'][month] = \
|
| 307 |
-
consolidated['income_breakdown']['by_month'].get(month, 0) + amount
|
| 308 |
-
|
| 309 |
-
# Merge expense breakdown by category
|
| 310 |
-
if 'expense_breakdown' in batch_data:
|
| 311 |
-
for category, amount in batch_data['expense_breakdown'].get('by_category', {}).items():
|
| 312 |
-
consolidated['expense_breakdown']['by_category'][category] = \
|
| 313 |
-
consolidated['expense_breakdown']['by_category'].get(category, 0) + amount
|
| 314 |
-
|
| 315 |
-
# Merge expenses by month
|
| 316 |
-
for month, amount in batch_data['expense_breakdown'].get('by_month', {}).items():
|
| 317 |
-
consolidated['expense_breakdown']['by_month'][month] = \
|
| 318 |
-
consolidated['expense_breakdown']['by_month'].get(month, 0) + amount
|
| 319 |
-
|
| 320 |
-
# Collect key transactions
|
| 321 |
-
if 'key_transactions' in batch_data:
|
| 322 |
-
consolidated['key_transactions'].extend(batch_data.get('key_transactions', []))
|
| 323 |
-
|
| 324 |
-
# Merge monthly totals
|
| 325 |
-
if 'monthly_totals' in batch_data:
|
| 326 |
-
for month, totals in batch_data['monthly_totals'].items():
|
| 327 |
-
if month not in consolidated['monthly_totals']:
|
| 328 |
-
consolidated['monthly_totals'][month] = {"income": 0, "expenses": 0, "net": 0}
|
| 329 |
-
|
| 330 |
-
consolidated['monthly_totals'][month]['income'] += totals.get('income', 0)
|
| 331 |
-
consolidated['monthly_totals'][month]['expenses'] += totals.get('expenses', 0)
|
| 332 |
-
consolidated['monthly_totals'][month]['net'] += totals.get('net', 0)
|
| 333 |
-
|
| 334 |
-
except json.JSONDecodeError as e:
|
| 335 |
-
st.warning(f"Could not parse batch summary JSON: {e}")
|
| 336 |
-
continue
|
| 337 |
-
except Exception as e:
|
| 338 |
-
st.warning(f"Error processing batch summary: {e}")
|
| 339 |
-
continue
|
| 340 |
-
|
| 341 |
-
# Calculate final net position
|
| 342 |
-
consolidated['financial_summary']['net_position'] = \
|
| 343 |
-
consolidated['financial_summary']['total_income'] - consolidated['financial_summary']['total_expenses']
|
| 344 |
-
|
| 345 |
-
st.success(f"Successfully consolidated data from {len(batch_summaries)} batches covering {consolidated['total_transactions']} transactions.")
|
| 346 |
-
return consolidated
|
| 347 |
-
|
| 348 |
-
def generate_final_report(model, consolidated_data, statement_type):
|
| 349 |
-
"""Generate the final comprehensive report using consolidated batch data."""
|
| 350 |
-
st.info("Generating final comprehensive report from consolidated data...")
|
| 351 |
-
|
| 352 |
-
prompt = f"""Using this consolidated financial data, generate a comprehensive {statement_type} report:
|
| 353 |
-
|
| 354 |
-
Consolidated Data:
|
| 355 |
-
{json.dumps(consolidated_data, indent=2)}
|
| 356 |
-
|
| 357 |
-
Generate a detailed {statement_type} report with the following requirements:
|
| 358 |
-
|
| 359 |
-
1. **Professional Format**: Use standard South African accounting format and terminology
|
| 360 |
-
2. **Clear Structure**: Organize with proper headings, subheadings, and sections
|
| 361 |
-
3. **Comprehensive Analysis**: Include:
|
| 362 |
-
- Executive Summary
|
| 363 |
-
- Detailed breakdown by categories/customers
|
| 364 |
-
- Monthly trend analysis
|
| 365 |
-
- Key performance indicators
|
| 366 |
-
- Notable transactions and patterns
|
| 367 |
-
4. **Visual Elements**: Use tables, proper formatting for better readability
|
| 368 |
-
5. **Insights**: Provide meaningful business insights based on the data
|
| 369 |
-
6. **Currency**: Use "R" for South African Rand where appropriate
|
| 370 |
-
|
| 371 |
-
Return the report in well-formatted Markdown. Do not include company name if not available.
|
| 372 |
-
Focus on creating a professional, comprehensive financial statement that provides clear insights into the business performance."""
|
| 373 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 374 |
try:
|
|
|
|
| 375 |
response = model.generate_content([prompt])
|
| 376 |
-
time.sleep(
|
| 377 |
-
st.success("
|
| 378 |
return response.text
|
| 379 |
-
except exceptions.
|
| 380 |
-
|
| 381 |
-
st.error("Final report generation timed out. The consolidated data might be too large.")
|
| 382 |
-
return None
|
| 383 |
-
else:
|
| 384 |
-
st.error(f"API error generating final report: {e}")
|
| 385 |
-
raise
|
| 386 |
-
except Exception as e:
|
| 387 |
-
st.error(f"Error generating final report: {e}")
|
| 388 |
return None
|
| 389 |
-
|
| 390 |
-
|
| 391 |
-
"""Main function to generate financial report using batch processing."""
|
| 392 |
-
st.info(f"Starting batched financial report generation for {len(filtered_transactions)} transactions...")
|
| 393 |
-
|
| 394 |
-
# Step 1: Split transactions into batches
|
| 395 |
-
transaction_batches = chunk_transactions(filtered_transactions, batch_size)
|
| 396 |
-
|
| 397 |
-
# Step 2: Process each batch
|
| 398 |
-
batch_summaries = []
|
| 399 |
-
progress_bar = st.progress(0)
|
| 400 |
-
status_text = st.empty()
|
| 401 |
-
|
| 402 |
-
for i, batch in enumerate(transaction_batches):
|
| 403 |
-
progress = (i + 1) / len(transaction_batches)
|
| 404 |
-
progress_bar.progress(progress)
|
| 405 |
-
status_text.text(f"Processing batch {i + 1} of {len(transaction_batches)}...")
|
| 406 |
-
|
| 407 |
-
batch_json = {"transactions": batch}
|
| 408 |
-
summary = generate_batch_summary(model, batch_json, start_date, end_date, statement_type, i + 1, len(transaction_batches))
|
| 409 |
-
|
| 410 |
-
if summary:
|
| 411 |
-
batch_summaries.append(summary)
|
| 412 |
-
|
| 413 |
-
progress_bar.progress(1.0)
|
| 414 |
-
status_text.text("All batches processed!")
|
| 415 |
-
|
| 416 |
-
if not batch_summaries:
|
| 417 |
-
st.error("No batch summaries were successfully generated.")
|
| 418 |
return None
|
| 419 |
-
|
| 420 |
-
# Step 3: Consolidate batch summaries
|
| 421 |
-
consolidated_data = consolidate_batch_summaries(batch_summaries, start_date, end_date, statement_type)
|
| 422 |
-
|
| 423 |
-
# Step 4: Generate final comprehensive report
|
| 424 |
-
final_report = generate_final_report(model, consolidated_data, statement_type)
|
| 425 |
-
|
| 426 |
-
return final_report
|
| 427 |
-
# Install required libraries:
|
| 428 |
-
# pip install fpdf2 beautifulsoup4 markdown
|
| 429 |
-
|
| 430 |
-
from bs4 import BeautifulSoup
|
| 431 |
-
# For logging errors/info
|
| 432 |
|
|
|
|
| 433 |
class PDF_Generator(FPDF):
|
| 434 |
-
"""
|
| 435 |
-
FPDF subclass to potentially add headers/footers later if needed.
|
| 436 |
-
Currently just a basic FPDF wrapper.
|
| 437 |
-
"""
|
| 438 |
-
def __init__(self, orientation='P', unit='mm', format='A4'):
|
| 439 |
-
super().__init__(orientation, unit, format)
|
| 440 |
-
self.set_auto_page_break(auto=True, margin=15) # Enable auto page break
|
| 441 |
-
self.set_left_margin(15)
|
| 442 |
-
self.set_right_margin(15)
|
| 443 |
-
self.alias_nb_pages() # Allows for page numbering {nb}
|
| 444 |
-
|
| 445 |
-
# Example: Add a simple footer
|
| 446 |
-
# def footer(self):
|
| 447 |
-
# self.set_y(-15) # Position 1.5 cm from bottom
|
| 448 |
-
# self.set_font('helvetica', 'I', 8)
|
| 449 |
-
# self.cell(0, 10, f'Page {self.page_no()}/{{nb}}', 0, 0, 'C')
|
| 450 |
-
|
| 451 |
def add_html_element(self, tag, styles):
|
| 452 |
-
""" Processes a single HTML tag """
|
| 453 |
text = tag.get_text()
|
| 454 |
tag_name = tag.name.lower()
|
| 455 |
-
|
| 456 |
-
# --- Basic Styling ---
|
| 457 |
current_style = ''
|
| 458 |
-
if 'b' in styles or 'strong' in styles:
|
| 459 |
-
|
| 460 |
-
if '
|
| 461 |
-
current_style += 'I'
|
| 462 |
-
# Reset font to default if no style
|
| 463 |
-
if not current_style:
|
| 464 |
-
self.set_font('helvetica', '', self.font_size_pt) # Reset to regular if needed
|
| 465 |
-
|
| 466 |
-
# --- Handle Specific Tags ---
|
| 467 |
if tag_name in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']:
|
| 468 |
level = int(tag_name[1])
|
| 469 |
font_size = {1: 18, 2: 16, 3: 14, 4: 12, 5: 11, 6: 10}.get(level, 10)
|
| 470 |
-
self.set_font('helvetica', 'B', font_size)
|
| 471 |
-
self.multi_cell(0, font_size * 0.5, text, align='L')
|
| 472 |
-
self.ln(font_size * 0.3)
|
| 473 |
-
self.set_font('helvetica', '', 10)
|
| 474 |
elif tag_name == 'p':
|
| 475 |
self.set_font('helvetica', current_style, 10)
|
| 476 |
-
self.multi_cell(0, 5, text, align='L')
|
| 477 |
-
self.ln(3)
|
| 478 |
elif tag_name == 'ul':
|
| 479 |
self.ln(2)
|
| 480 |
for item in tag.find_all('li', recursive=False):
|
| 481 |
-
self.set_font('helvetica', '', 10)
|
| 482 |
-
item_text = item.get_text()
|
| 483 |
-
self.cell(5, 5, chr(127)) # Bullet point (using a circle character)
|
| 484 |
-
self.multi_cell(0, 5, item_text, align='L') # Remaining width
|
| 485 |
-
self.ln(1) # Small space between items
|
| 486 |
-
self.ln(3)
|
| 487 |
-
elif tag_name == 'ol':
|
| 488 |
-
self.ln(2)
|
| 489 |
-
for i, item in enumerate(tag.find_all('li', recursive=False), 1):
|
| 490 |
-
self.set_font('helvetica', '', 10) # Reset font for list item text
|
| 491 |
item_text = item.get_text()
|
| 492 |
-
self.cell(
|
| 493 |
self.multi_cell(0, 5, item_text, align='L')
|
| 494 |
self.ln(1)
|
| 495 |
self.ln(3)
|
|
@@ -497,387 +243,168 @@ class PDF_Generator(FPDF):
|
|
| 497 |
self.ln(5)
|
| 498 |
self.process_table(tag)
|
| 499 |
self.ln(5)
|
| 500 |
-
elif tag_name
|
| 501 |
-
# Handled by style tracking within parent elements for now
|
| 502 |
-
# Direct rendering might be needed for nested styles
|
| 503 |
-
pass # Style is applied by parent
|
| 504 |
-
elif tag_name == 'br':
|
| 505 |
-
self.ln(5) # Treat <br> as a line break
|
| 506 |
elif tag_name == 'hr':
|
| 507 |
self.ln(2)
|
| 508 |
self.line(self.get_x(), self.get_y(), self.w - self.r_margin, self.get_y())
|
| 509 |
self.ln(4)
|
| 510 |
else:
|
| 511 |
-
|
| 512 |
-
if text.strip(): # Only print if there's actual text
|
| 513 |
self.set_font('helvetica', current_style, 10)
|
| 514 |
self.multi_cell(0, 5, text, align='L')
|
| 515 |
self.ln(1)
|
| 516 |
|
| 517 |
def process_table(self, table_tag):
|
| 518 |
-
""" Rudimentary table processing """
|
| 519 |
rows = table_tag.find_all('tr')
|
| 520 |
-
if not rows:
|
| 521 |
-
return
|
| 522 |
-
|
| 523 |
-
# --- Determine number of columns (use first row) ---
|
| 524 |
header_cells = rows[0].find_all(['th', 'td'])
|
| 525 |
num_cols = len(header_cells)
|
| 526 |
-
if num_cols == 0:
|
| 527 |
-
return
|
| 528 |
-
|
| 529 |
-
# --- Calculate column widths (simple equal distribution) ---
|
| 530 |
-
# Effective page width = Page width - left margin - right margin
|
| 531 |
effective_width = self.w - self.l_margin - self.r_margin
|
| 532 |
col_width = effective_width / num_cols
|
| 533 |
-
default_cell_height = 6
|
| 534 |
-
|
| 535 |
-
# --- Process Header Row ---
|
| 536 |
is_first_row = True
|
| 537 |
for row in rows:
|
| 538 |
cells = row.find_all(['th', 'td'])
|
| 539 |
-
if len(cells) != num_cols:
|
| 540 |
-
st.warning(f"Table row has inconsistent number of cells ({len(cells)} vs {num_cols}). Skipping row.")
|
| 541 |
-
continue # Skip rows with wrong number of cells
|
| 542 |
-
|
| 543 |
-
# Check page break possibility before drawing row
|
| 544 |
-
max_cell_h = default_cell_height # Start with default
|
| 545 |
-
# Estimate height needed (very basic, doesn't account for actual wrap height)
|
| 546 |
-
for cell in cells:
|
| 547 |
-
# This is a rough estimate, multi_cell calculates real height
|
| 548 |
-
pass # Cannot easily pre-calculate multi_cell height
|
| 549 |
-
|
| 550 |
-
# If estimated height exceeds remaining page space, add page
|
| 551 |
-
# Note: FPDF's auto page break handles this better with multi_cell
|
| 552 |
-
# if self.get_y() + max_cell_h > self.page_break_trigger:
|
| 553 |
-
# self.add_page()
|
| 554 |
-
|
| 555 |
is_header_row = all(c.name == 'th' for c in cells) or (is_first_row and any(c.name == 'th' for c in cells))
|
| 556 |
-
|
| 557 |
for i, cell in enumerate(cells):
|
| 558 |
cell_text = cell.get_text().strip()
|
| 559 |
if is_header_row:
|
| 560 |
-
self.set_font('helvetica', 'B', 9)
|
| 561 |
-
self.set_fill_color(230, 230, 230)
|
| 562 |
fill = True
|
| 563 |
else:
|
| 564 |
-
self.set_font('helvetica', '', 9)
|
| 565 |
-
|
| 566 |
-
|
| 567 |
-
|
| 568 |
-
|
| 569 |
-
# multi_cell automatically handles height based on content.
|
| 570 |
-
self.multi_cell(col_width, default_cell_height, cell_text, border=1, align='L', fill=fill, ln=3) # ln=3 moves to beginning of next cell
|
| 571 |
-
|
| 572 |
-
self.ln(default_cell_height) # Move down after the row is complete (based on default height, multi_cell might make row taller)
|
| 573 |
-
is_first_row = False # Header only applies to the first row potentially
|
| 574 |
-
|
| 575 |
|
| 576 |
def create_pdf_report(report_text):
|
| 577 |
-
"""
|
| 578 |
-
Creates a PDF from markdown text locally using FPDF2.
|
| 579 |
-
|
| 580 |
-
Args:
|
| 581 |
-
report_text (str): Markdown formatted report text.
|
| 582 |
-
|
| 583 |
-
Returns:
|
| 584 |
-
BytesIO: PDF file in memory buffer.
|
| 585 |
-
|
| 586 |
-
Raises:
|
| 587 |
-
Exception: If PDF generation fails.
|
| 588 |
-
"""
|
| 589 |
if not report_text:
|
| 590 |
-
st.warning("Report text is empty, skipping PDF generation.")
|
| 591 |
raise ValueError("Input report_text cannot be empty.")
|
| 592 |
-
|
| 593 |
try:
|
| 594 |
-
st.info("Starting PDF generation from markdown report...")
|
| 595 |
-
|
| 596 |
-
|
| 597 |
-
cleaned_md = re.sub(r'\s*```$', '', cleaned_md, flags=re.MULTILINE)
|
| 598 |
-
cleaned_md = cleaned_md.strip()
|
| 599 |
-
# st.debug("Markdown cleaned.") # Too verbose
|
| 600 |
-
|
| 601 |
-
# 2. Convert Markdown to HTML
|
| 602 |
-
html_content = markdown.markdown(cleaned_md, extensions=['tables', 'fenced_code', 'sane_lists'])
|
| 603 |
-
if not html_content:
|
| 604 |
-
st.error("Markdown parsing resulted in empty HTML.") # Log
|
| 605 |
-
raise ValueError("Markdown parsing resulted in empty HTML.")
|
| 606 |
-
# st.debug("Markdown converted to HTML.") # Too verbose
|
| 607 |
-
|
| 608 |
-
# 3. Parse HTML with BeautifulSoup
|
| 609 |
soup = BeautifulSoup(html_content, 'html.parser')
|
| 610 |
-
# st.debug("HTML parsed with BeautifulSoup.") # Too verbose
|
| 611 |
-
|
| 612 |
-
# 4. Generate PDF using FPDF
|
| 613 |
pdf = PDF_Generator()
|
|
|
|
|
|
|
|
|
|
| 614 |
pdf.add_page()
|
| 615 |
-
pdf.set_font('helvetica', '', 10)
|
| 616 |
-
st.info("PDF document initialized, adding content...") # Log
|
| 617 |
-
|
| 618 |
-
# Iterate through top-level tags in the HTML body
|
| 619 |
for element in soup.find_all(recursive=False):
|
| 620 |
-
|
| 621 |
-
|
| 622 |
-
|
| 623 |
-
|
| 624 |
-
current_styles.add('b')
|
| 625 |
-
local_style_added = 'b'
|
| 626 |
-
elif tag.name in ['i', 'em']:
|
| 627 |
-
current_styles.add('i')
|
| 628 |
-
local_style_added = 'i'
|
| 629 |
-
|
| 630 |
-
if tag.name in ['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'ul', 'ol', 'table', 'br', 'hr']:
|
| 631 |
-
pdf.add_html_element(tag, current_styles.copy())
|
| 632 |
-
else:
|
| 633 |
-
if hasattr(tag, 'contents'):
|
| 634 |
-
for child in tag.contents:
|
| 635 |
-
if isinstance(child, str):
|
| 636 |
-
pass
|
| 637 |
-
elif hasattr(child, 'name'):
|
| 638 |
-
traverse(child, current_styles.copy())
|
| 639 |
-
|
| 640 |
-
if local_style_added and local_style_added in current_styles:
|
| 641 |
-
current_styles.remove(local_style_added)
|
| 642 |
-
|
| 643 |
-
traverse(element, styles)
|
| 644 |
-
|
| 645 |
-
st.info("Content added to PDF. Outputting PDF to buffer...") # Log
|
| 646 |
-
# 5. Output PDF to BytesIO buffer
|
| 647 |
-
pdf_output = pdf.output(dest='S') # Output as bytes string
|
| 648 |
-
if isinstance(pdf_output, str):
|
| 649 |
-
# If output is string (older fpdf versions?), encode it
|
| 650 |
-
pdf_output = pdf_output.encode('latin-1')
|
| 651 |
-
|
| 652 |
-
st.success("PDF report generated successfully.") # Log success
|
| 653 |
return BytesIO(pdf_output)
|
| 654 |
-
|
| 655 |
-
except ImportError:
|
| 656 |
-
st.error("FPDF2, BeautifulSoup4 or Markdown library not installed. Please install using: pip install fpdf2 beautifulsoup4 markdown")
|
| 657 |
-
raise Exception("Missing required libraries: FPDF2, BeautifulSoup4, Markdown") from None
|
| 658 |
except Exception as e:
|
| 659 |
-
st.error(f"Failed to generate PDF
|
| 660 |
-
st.exception(e)
|
| 661 |
-
raise
|
| 662 |
-
|
| 663 |
|
| 664 |
def main():
|
| 665 |
st.title("Quantitlytix AI")
|
| 666 |
st.markdown("*Bank Statement Parser & Financial Report Generator*")
|
| 667 |
-
st.info("Application started. Ready for user input.") # Log app start
|
| 668 |
|
| 669 |
-
# Initialize session state for last error
|
| 670 |
-
if 'last_error' not in st.session_state:
|
| 671 |
-
st.session_state['last_error'] = None
|
| 672 |
-
|
| 673 |
-
# Initialize session state for transaction date range
|
| 674 |
if 'min_date' not in st.session_state:
|
| 675 |
-
st.session_state['min_date'] = date(2024, 1, 1)
|
| 676 |
if 'max_date' not in st.session_state:
|
| 677 |
-
st.session_state['max_date'] = date(
|
|
|
|
|
|
|
| 678 |
|
| 679 |
-
# Sidebar: Select input type: Bulk PDF or CSV Upload
|
| 680 |
input_type = st.sidebar.radio("Select Input Type", ("Bulk Bank Statement Upload", "CSV Upload"))
|
| 681 |
-
st.info(f"Input type selected: {input_type}") # Log input type
|
| 682 |
-
|
| 683 |
-
all_transactions = []
|
| 684 |
|
| 685 |
if input_type == "Bulk Bank Statement Upload":
|
| 686 |
uploaded_files = st.file_uploader("Upload PDF bank statements", type="pdf", accept_multiple_files=True)
|
| 687 |
if uploaded_files:
|
| 688 |
-
st.info(f"User uploaded {len(uploaded_files)} PDF file(s).")
|
| 689 |
-
|
| 690 |
-
st.
|
| 691 |
-
|
| 692 |
-
|
| 693 |
-
|
| 694 |
-
|
| 695 |
-
|
| 696 |
-
|
| 697 |
-
|
| 698 |
-
|
| 699 |
-
|
| 700 |
-
|
| 701 |
-
|
| 702 |
-
file_progress = (file_index) / total_files
|
| 703 |
-
progress_bar.progress(overall_progress) # Corrected variable name
|
| 704 |
-
status_text.text(f"Processing file {file_index+1} of {total_files}: {uploaded_file.name}")
|
| 705 |
-
|
| 706 |
-
# Get PDF reader and page count
|
| 707 |
-
try:
|
| 708 |
-
pdf_reader, total_pages = read_pdf_pages(uploaded_file)
|
| 709 |
-
|
| 710 |
-
if total_pages == 0:
|
| 711 |
-
st.warning(f"No pages found in {uploaded_file.name}. Skipping file.") # Log
|
| 712 |
-
continue
|
| 713 |
-
|
| 714 |
-
with st.spinner(f"Processing {uploaded_file.name} ({total_pages} pages)..."):
|
| 715 |
-
# Define progress callback for page-by-page processing
|
| 716 |
-
def update_page_progress(page_progress, message):
|
| 717 |
-
# Calculate overall progress (file progress + current file's contribution)
|
| 718 |
-
overall_progress = file_progress + (page_progress * (1/total_files))
|
| 719 |
-
progress_bar.progress(overall_progress)
|
| 720 |
-
status_text.text(f"File {file_index+1}/{total_files}: {message}")
|
| 721 |
-
|
| 722 |
-
# Process the PDF page by page
|
| 723 |
-
st.info(f"Calling process_pdf_pages for {uploaded_file.name}...") # Log
|
| 724 |
-
file_transactions = process_pdf_pages(
|
| 725 |
-
model,
|
| 726 |
-
pdf_reader,
|
| 727 |
-
total_pages,
|
| 728 |
-
progress_callback=update_page_progress
|
| 729 |
-
)
|
| 730 |
-
|
| 731 |
-
# Add transactions from this file to overall list
|
| 732 |
-
all_transactions.extend(file_transactions)
|
| 733 |
-
st.info(f"Finished processing {uploaded_file.name}. Extracted {len(file_transactions)} transactions.") # Log file completion
|
| 734 |
-
|
| 735 |
-
except Exception as e:
|
| 736 |
-
st.error(f"Error processing {uploaded_file.name}: {str(e)}") # Log specific file error
|
| 737 |
-
st.exception(e) # Show traceback
|
| 738 |
-
continue
|
| 739 |
-
|
| 740 |
-
# Complete the progress bar
|
| 741 |
-
progress_bar.progress(1.0)
|
| 742 |
-
status_text.text(f"Completed processing {total_files} files!")
|
| 743 |
-
st.success(f"All PDF files processed. Total transactions collected: {len(all_transactions)}.") # Log overall completion
|
| 744 |
-
|
| 745 |
-
except Exception as e:
|
| 746 |
-
st.error(f"Overall error during PDF document processing: {str(e)}") # Log general error during PDF handling
|
| 747 |
-
st.error("Please ensure you're using valid bank statement PDFs and a valid API key")
|
| 748 |
-
st.exception(e) # Show traceback
|
| 749 |
elif input_type == "CSV Upload":
|
| 750 |
uploaded_csv = st.file_uploader("Upload CSV of transactions", type="csv")
|
| 751 |
if uploaded_csv:
|
| 752 |
-
st.info(f"User uploaded CSV file: {uploaded_csv.name}.")
|
| 753 |
-
|
| 754 |
-
df = pd.read_csv(uploaded_csv)
|
| 755 |
-
# Drop 'Unnamed:' columns from the uploaded CSV
|
| 756 |
-
df = df.loc[:, ~df.columns.str.startswith('Unnamed:')]
|
| 757 |
-
st.write("CSV Data Preview:")
|
| 758 |
-
st.dataframe(df.head())
|
| 759 |
-
# Convert dataframe to list of transaction dictionaries
|
| 760 |
-
transactions = df.to_dict(orient='records')
|
| 761 |
-
all_transactions.extend(transactions)
|
| 762 |
-
st.success(f"Successfully loaded {len(transactions)} transactions from CSV.") # Log
|
| 763 |
-
except Exception as e:
|
| 764 |
-
st.error(f"Error processing CSV file: {str(e)}") # Log CSV error
|
| 765 |
-
st.exception(e)
|
| 766 |
-
|
| 767 |
-
# If transactions are loaded, show DataFrame and update date ranges
|
| 768 |
-
if all_transactions:
|
| 769 |
-
st.info("Consolidating and displaying all extracted transactions.") # Log
|
| 770 |
-
df = pd.DataFrame(all_transactions)
|
| 771 |
-
# Drop 'Unnamed:' columns from the final DataFrame
|
| 772 |
-
if not df.empty:
|
| 773 |
df = df.loc[:, ~df.columns.str.startswith('Unnamed:')]
|
| 774 |
-
|
| 775 |
-
|
| 776 |
-
|
| 777 |
-
|
| 778 |
-
|
| 779 |
-
|
| 780 |
-
|
| 781 |
-
|
| 782 |
-
|
| 783 |
-
|
| 784 |
-
|
| 785 |
-
|
| 786 |
-
|
| 787 |
-
|
| 788 |
-
|
| 789 |
-
st.warning("Could not determine valid date range from transactions. Using default dates.") # Log
|
| 790 |
-
|
| 791 |
-
# Format dates for display
|
| 792 |
-
df['Date'] = df['Date'].dt.strftime('%d/%m/%Y')
|
| 793 |
-
|
| 794 |
-
except Exception as e:
|
| 795 |
-
st.warning("Some transaction dates could not be formatted correctly.")
|
| 796 |
-
st.exception(e)
|
| 797 |
-
|
| 798 |
-
st.success("Transactions loaded successfully!")
|
| 799 |
-
st.write("### Extracted Transactions")
|
| 800 |
-
st.dataframe(df)
|
| 801 |
-
else:
|
| 802 |
-
st.warning("No valid transactions could be extracted from the documents.")
|
| 803 |
else:
|
| 804 |
-
st.info("No transactions loaded yet. Upload files to begin.")
|
| 805 |
|
| 806 |
-
# Financial report generation parameters
|
| 807 |
st.write("### Generate Financial Report")
|
| 808 |
col1, col2 = st.columns(2)
|
| 809 |
with col1:
|
| 810 |
-
# Use the min_date from transactions if available, otherwise use default
|
| 811 |
start_date = st.date_input("Start Date", st.session_state['min_date'])
|
| 812 |
with col2:
|
| 813 |
-
# Use the max_date from transactions if available, otherwise use default
|
| 814 |
end_date = st.date_input("End Date", st.session_state['max_date'])
|
| 815 |
-
|
| 816 |
statement_type = st.selectbox("Select Financial Statement", ["Income Statement", "Cashflow Statement", "Balance Sheet"])
|
| 817 |
|
| 818 |
if st.button("Generate Financial Report"):
|
| 819 |
-
st.
|
| 820 |
-
if not all_transactions:
|
| 821 |
st.error("No transactions available to generate report. Please upload files first.")
|
| 822 |
else:
|
| 823 |
-
|
| 824 |
-
|
| 825 |
-
|
| 826 |
-
|
| 827 |
-
|
| 828 |
-
|
| 829 |
-
|
| 830 |
-
filtered_transactions.append(transaction)
|
| 831 |
-
except (ValueError, TypeError):
|
| 832 |
-
st.warning(f"Could not parse date for transaction, skipping: {transaction}")
|
| 833 |
-
continue
|
| 834 |
-
|
| 835 |
-
if not filtered_transactions:
|
| 836 |
-
st.warning("No transactions found within the selected date range. Please adjust dates or upload relevant files.")
|
| 837 |
else:
|
| 838 |
-
st.info(f"Found {len(
|
|
|
|
| 839 |
try:
|
| 840 |
-
|
| 841 |
-
|
| 842 |
-
|
| 843 |
-
|
| 844 |
-
|
| 845 |
-
|
| 846 |
-
|
| 847 |
-
|
| 848 |
-
)
|
| 849 |
-
|
| 850 |
-
st.info("Using standard processing for smaller dataset...")
|
| 851 |
-
combined_json = {"transactions": filtered_transactions}
|
| 852 |
-
with st.spinner("Generating financial report..."):
|
| 853 |
-
report_text = generate_financial_report(model1, combined_json, start_date, end_date, statement_type)
|
| 854 |
-
|
| 855 |
-
if report_text:
|
| 856 |
-
st.success("Financial report generated successfully!")
|
| 857 |
-
|
| 858 |
-
# Display the report as markdown
|
| 859 |
-
st.markdown("### Financial Report Preview")
|
| 860 |
-
st.markdown(report_text)
|
| 861 |
-
|
| 862 |
-
# Create PDF from markdown
|
| 863 |
-
try:
|
| 864 |
-
st.info("Generating PDF from the report...")
|
| 865 |
pdf_buffer = create_pdf_report(report_text)
|
| 866 |
st.download_button(
|
| 867 |
label="Download Financial Report as PDF",
|
| 868 |
-
data=pdf_buffer
|
| 869 |
file_name=f"{statement_type.replace(' ', '_')}_{datetime.now().strftime('%Y%m%d')}.pdf",
|
| 870 |
mime="application/pdf"
|
| 871 |
)
|
| 872 |
-
|
| 873 |
-
|
| 874 |
-
st.error(f"Error generating PDF: {str(e)}")
|
| 875 |
-
st.exception(e)
|
| 876 |
-
|
| 877 |
except Exception as e:
|
| 878 |
-
st.error(f"
|
| 879 |
-
if "504" in str(e):
|
| 880 |
-
st.info("Consider using a smaller date range or fewer transactions.")
|
| 881 |
st.exception(e)
|
| 882 |
|
| 883 |
if __name__ == "__main__":
|
|
|
|
| 2 |
import json
|
| 3 |
import os
|
| 4 |
import time
|
| 5 |
+
from datetime import datetime, date
|
| 6 |
from io import BytesIO
|
|
|
|
| 7 |
import pandas as pd
|
| 8 |
import streamlit as st
|
| 9 |
import google.generativeai as genai
|
| 10 |
import pypdf
|
| 11 |
from fpdf import FPDF
|
|
|
|
|
|
|
| 12 |
from google.api_core import exceptions
|
| 13 |
+
import markdown
|
| 14 |
+
from bs4 import BeautifulSoup
|
| 15 |
|
| 16 |
+
# Configure API key for Gemini - Ensure this is set in your environment variables
|
| 17 |
api_key = os.getenv('Gemini')
|
| 18 |
|
| 19 |
def configure_gemini(api_key):
    """Return the Gemini model used for transaction extraction.

    Posts a status message to the Streamlit page, registers ``api_key``
    with the ``google.generativeai`` client and builds the model object.

    Args:
        api_key: Gemini API key, passed unchanged to ``genai.configure``.

    Returns:
        A ``genai.GenerativeModel`` for 'gemini-2.0-flash-thinking-exp'.
    """
    st.info("Configuring Gemini API for transaction extraction...")
    genai.configure(api_key=api_key)
    # Model chosen for the page-by-page extraction step.
    extraction_model = genai.GenerativeModel('gemini-2.0-flash-thinking-exp')
    return extraction_model
|
| 27 |
|
| 28 |
def configure_gemini1(api_key):
    """Return the Gemini model used for report generation.

    Posts a status message to the Streamlit page, registers ``api_key``
    with the ``google.generativeai`` client and builds the model object.

    Args:
        api_key: Gemini API key, passed unchanged to ``genai.configure``.

    Returns:
        A ``genai.GenerativeModel`` for 'gemini-2.5-pro'.
    """
    st.info("Configuring Gemini API for report generation...")
    genai.configure(api_key=api_key)
    # Model chosen for formatting the final report.
    report_model = genai.GenerativeModel('gemini-2.5-pro')
    return report_model
|
| 36 |
+
|
|
|
|
| 37 |
def read_pdf_pages(file_obj):
    """Open an uploaded PDF and count its pages.

    Args:
        file_obj: Seekable file-like object exposing a ``name`` attribute
            (used only for the status message).

    Returns:
        A ``(reader, page_count)`` tuple: the ``pypdf.PdfReader`` built from
        ``file_obj`` and the length of its ``pages`` collection.
    """
    st.info(f"Reading PDF pages from {file_obj.name}...")
    # Rewind first so the reader always starts at the beginning of the buffer.
    file_obj.seek(0)
    reader = pypdf.PdfReader(file_obj)
    page_count = len(reader.pages)
    st.info(f"Found {page_count} pages in PDF.")
    return reader, page_count
|
| 44 |
|
|
|
|
| 45 |
def extract_page_text(pdf_reader, page_num):
    """Return the extractable text of one page, or "" when there is none.

    Args:
        pdf_reader: Object exposing a ``pages`` sequence whose items provide
            ``extract_text()`` (e.g. a ``pypdf.PdfReader``).
        page_num: Zero-based page index.

    Returns:
        The page text as returned by ``extract_text()``; an empty string when
        the index is out of range or the page yields no text.
    """
    # Reject negative indices as well: Python sequences would otherwise wrap
    # around and silently return a page counted from the end.
    if not 0 <= page_num < len(pdf_reader.pages):
        return ""

    text = pdf_reader.pages[page_num].extract_text()
    if not text or not text.strip():
        st.warning(f"Page {page_num + 1} appears to be empty or contains no extractable text.")
    return text or ""
|
| 52 |
|
|
|
|
| 53 |
def process_with_gemini(model, text):
|
| 54 |
prompt = """Analyze this bank statement and extract transactions in JSON format with these fields:
|
| 55 |
- Date (format DD/MM/YYYY)
|
|
|
|
| 76 |
]
|
| 77 |
}"""
|
| 78 |
try:
|
|
|
|
| 79 |
response = model.generate_content([prompt, text])
|
| 80 |
+
time.sleep(6) # Retaining original sleep time as per user's working code
|
|
|
|
| 81 |
return response.text
|
| 82 |
+
except exceptions.GoogleAPICallError as e:
|
| 83 |
+
st.error(f"A Google API call error occurred during transaction extraction: {e}")
|
| 84 |
+
if "context length" in str(e):
|
| 85 |
+
st.warning("The text on a single PDF page may be too long for the extraction model.")
|
| 86 |
+
return None
|
|
|
|
|
|
|
| 87 |
except Exception as e:
|
| 88 |
+
st.error(f"An unexpected error occurred during Gemini transaction extraction: {e}")
|
| 89 |
return None
|
| 90 |
|
|
|
|
| 91 |
def process_pdf_pages(model, pdf_reader, total_pages, progress_callback=None):
    """Extract transactions from a PDF one page at a time.

    Each non-blank page is sent to ``process_with_gemini``; the first
    ``{...}`` span in the reply is parsed as JSON and its ``transactions``
    list is appended to the running result.

    Args:
        model: Model object forwarded to ``process_with_gemini``.
        pdf_reader: Reader forwarded to ``extract_page_text``.
        total_pages: Number of pages to walk (indices ``0..total_pages-1``).
        progress_callback: Optional callable ``(fraction, message)`` invoked
            before each page is processed.

    Returns:
        A list with the transactions collected from all pages.
    """
    collected = []
    st.info(f"Starting page-by-page PDF processing for {total_pages} pages...")

    for page_index in range(total_pages):
        page_label = page_index + 1

        if progress_callback:
            progress_callback(page_index / total_pages, f"Processing page {page_label} of {total_pages}")

        page_text = extract_page_text(pdf_reader, page_index)
        if not page_text.strip():
            # Nothing to send for blank pages.
            continue

        st.info(f"Sending page {page_label} text to Gemini for transaction extraction...")
        raw_reply = process_with_gemini(model, page_text)
        if not raw_reply:
            st.warning(f"Gemini returned no response for page {page_label}.")
            continue

        # Greedy match: grab everything from the first '{' to the last '}'.
        json_block = re.search(r'\{.*\}', raw_reply, re.DOTALL)
        if json_block is None:
            st.warning(f"No valid JSON object found in Gemini response for page {page_label}.")
            continue

        try:
            payload = json.loads(json_block.group(0))
            page_transactions = payload.get('transactions', [])
            if page_transactions:
                st.info(f"Successfully extracted {len(page_transactions)} transactions from page {page_label}.")
                collected.extend(page_transactions)
        except json.JSONDecodeError:
            st.error(f"Failed to decode JSON from Gemini response for page {page_label}.")

    st.info(f"Finished processing all pages. Total transactions extracted: {len(collected)}.")
    return collected
|
| 128 |
|
| 129 |
+
def _sum_amount_by(frame, column):
    """Return ``{group: total Amount}`` for ``column``, or {} if it is absent."""
    if column not in frame.columns:
        return {}
    totals = frame.groupby(column)['Amount'].sum().round(2)
    # Plain str/float so the result can always be passed to json.dumps.
    return {str(group): float(total) for group, total in totals.items()}


def aggregate_financial_data(transactions: list, statement_type: str):
    """Summarise transactions locally so only a small payload goes to the LLM.

    Args:
        transactions: List of transaction dicts. ``Amount`` and ``Type`` are
            required; ``Category_of_expense`` and ``Customer_name`` are used
            for the Income Statement breakdowns when present.
        statement_type: "Income Statement", "Cashflow Statement" or
            "Balance Sheet"; selects the extra fields added to the summary.

    Returns:
        A dict of built-in Python values (safe for ``json.dumps``) holding the
        totals plus statement-specific fields, or ``None`` when there is
        nothing to aggregate or a required column is missing.
    """
    if not transactions:
        st.warning("No transactions to aggregate.")
        return None
    st.info(f"Performing local financial aggregation for {len(transactions)} transactions...")

    df = pd.DataFrame(transactions)

    missing = [column for column in ('Amount', 'Type') if column not in df.columns]
    if missing:
        st.error(f"Cannot aggregate transactions: missing column(s) {', '.join(missing)}.")
        return None

    # --- Data Cleaning and Preparation ---
    # Unparseable amounts become 0 instead of aborting the whole report.
    df['Amount'] = pd.to_numeric(df['Amount'], errors='coerce').fillna(0)
    # astype(str) keeps the .str accessor usable even when the column holds
    # only NaN or other non-string values.
    df['Type'] = df['Type'].astype(str).str.strip().str.lower()

    income_rows = df[df['Type'] == 'income']
    expense_rows = df[df['Type'] == 'expense']

    # --- Core Financial Calculations ---
    # float(): an all-integer Amount column sums to numpy.int64, which
    # json.dumps cannot serialise.
    total_income = round(float(income_rows['Amount'].sum()), 2)
    total_expenses = round(float(expense_rows['Amount'].sum()), 2)
    net_position = round(total_income - total_expenses, 2)

    # --- Build the Aggregated Data Structure ---
    aggregated_data = {
        "total_income": total_income,
        "total_expenses": total_expenses,
        "net_position": net_position,
        "transaction_count": len(df),
    }

    # --- Statement-Specific Aggregations ---
    if statement_type == "Income Statement":
        aggregated_data["expense_breakdown"] = _sum_amount_by(expense_rows, 'Category_of_expense')
        aggregated_data["income_breakdown"] = _sum_amount_by(income_rows, 'Customer_name')
    elif statement_type == "Cashflow Statement":
        aggregated_data["operating_cash_flow"] = net_position
        aggregated_data["cash_inflows"] = total_income
        aggregated_data["cash_outflows"] = total_expenses
    elif statement_type == "Balance Sheet":
        aggregated_data["notes"] = "Balance Sheets require asset and liability balances, not just transaction flows. This data can only show the net change in cash over the period."

    st.success("Local financial aggregation complete.")
    return aggregated_data
|
| 173 |
+
|
| 174 |
+
def _json_default(value):
    """Convert numpy scalars and date-like objects for ``json.dumps``."""
    to_python = getattr(value, 'item', None)
    if callable(to_python):
        # numpy scalars expose .item() returning the built-in equivalent.
        return to_python()
    to_iso = getattr(value, 'isoformat', None)
    if callable(to_iso):
        return to_iso()
    raise TypeError(f"Object of type {type(value).__name__} is not JSON serializable")


def generate_financial_report(model, aggregated_data, start_date, end_date, statement_type):
    """Ask the LLM to format a pre-aggregated summary as a Markdown report.

    The model is only asked to present the numbers in ``aggregated_data``;
    the calculations are expected to have been done beforehand.

    Args:
        model: Object exposing ``generate_content(list)`` whose result has a
            ``text`` attribute.
        aggregated_data: JSON-like summary dict embedded in the prompt.
        start_date: Period start; must support ``strftime``.
        end_date: Period end; must support ``strftime``.
        statement_type: Name of the statement to produce.

    Returns:
        The report text returned by the model, or ``None`` when the API call
        fails (the error is shown on the Streamlit page).
    """
    st.info(f"Preparing to generate {statement_type} with pre-aggregated data...")
    # default= keeps serialisation from raising on numpy integers or dates
    # that may be present in the summary.
    summary_json = json.dumps(aggregated_data, indent=2, default=_json_default)
    prompt = f"""
    Based on the following pre-aggregated financial summary JSON data:
    {summary_json}

    Generate a detailed {statement_type} report for the period from {start_date.strftime('%d/%m/%Y')} to {end_date.strftime('%d/%m/%Y')}. Present the report in a standard accounting format relevant to South Africa, but with improved readability and visual appeal.

    Specific Formatting and Content Requirements:

    Standard Accounting Structure (South Africa Focus): Organize the {statement_type} according to typical accounting practices followed in South Africa (e.g., for an Income Statement, clearly separate Revenue, Cost of Goods Sold, Gross Profit, Operating Expenses, and Net Income, in nice tables considering local terminology where applicable). If unsure of specific local variations, adhere to widely accepted international accounting structures.
    Clear Headings and Subheadings: Use distinct and informative headings and subheadings in English to delineate different sections of the report. Ensure these are visually prominent.
    Consistent Formatting: Maintain consistent formatting for monetary values (using "R" for South African Rand), dates, and alignment.
    Totals and Subtotals: Clearly display totals for relevant categories and subtotals where appropriate.
    Descriptive Line Items: Use the provided aggregated data to create clear line items.
    Key Insights: Include a brief section (e.g., "Key Highlights" or "Summary") that identifies significant trends or key performance indicators derived from the provided summary data.
    Concise Summary: Provide a concluding summary paragraph that encapsulates the overall financial picture.
    Special Case for Balance Sheet: If the request is for a "Balance Sheet," explain professionally that a balance sheet cannot be generated from transaction data alone, as it requires a snapshot of assets, liabilities, and equity. Then, present the available cash flow information as a helpful alternative.

    Format the entire report in Markdown for better visual structure.
    Do not name the company if a name is not there; refer to it as "The Business". Return just the report and nothing else.
    """
    try:
        st.info("Sending request to Gemini for final report formatting...")
        response = model.generate_content([prompt])
        time.sleep(7)  # Retaining original sleep time
        st.success("Successfully received formatted financial report from Gemini.")
        return response.text
    except exceptions.GoogleAPICallError as e:
        st.error(f"A Google API call error occurred during report generation: {e}")
        return None
    except Exception as e:
        # UI boundary: report anything unexpected instead of crashing the page.
        st.error(f"An unexpected error occurred during Gemini report generation: {e}")
        return None
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 212 |
|
| 213 |
+
# --- PDF Generation Logic (Unaltered as per your request) ---
|
| 214 |
class PDF_Generator(FPDF):
    """FPDF subclass that draws parsed HTML elements onto the current page.

    Elements are BeautifulSoup tags; headings, paragraphs, bullet and
    numbered lists, tables, line breaks and horizontal rules get dedicated
    handling, and any other tag is written as plain text.
    """

    def add_html_element(self, tag, styles):
        """Render a single top-level tag.

        Args:
            tag: BeautifulSoup tag to draw (``name`` and ``get_text()`` are
                used; list and table tags are searched for children).
            styles: Collection of inherited style names; 'b'/'strong' select
                bold and 'i'/'em' select italic for paragraph and fallback
                text.
        """
        text = tag.get_text()
        tag_name = tag.name.lower()

        current_style = ''
        if 'b' in styles or 'strong' in styles:
            current_style += 'B'
        if 'i' in styles or 'em' in styles:
            current_style += 'I'
        if not current_style:
            self.set_font('helvetica', '', self.font_size_pt)

        if tag_name in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']:
            level = int(tag_name[1])
            font_size = {1: 18, 2: 16, 3: 14, 4: 12, 5: 11, 6: 10}.get(level, 10)
            self.set_font('helvetica', 'B', font_size)
            self.multi_cell(0, font_size * 0.5, text, align='L')
            self.ln(font_size * 0.3)
            # Back to body text so the heading size does not leak onward.
            self.set_font('helvetica', '', 10)
        elif tag_name == 'p':
            self.set_font('helvetica', current_style, 10)
            self.multi_cell(0, 5, text, align='L')
            self.ln(3)
        elif tag_name in ('ul', 'ol'):
            # Ordered lists used to fall through to the generic branch and
            # were flattened into one paragraph; number their items instead.
            numbered = tag_name == 'ol'
            self.ln(2)
            for position, item in enumerate(tag.find_all('li', recursive=False), start=1):
                self.set_font('helvetica', '', 10)
                item_text = item.get_text()
                if numbered:
                    self.cell(8, 5, f"{position}.")
                else:
                    self.cell(5, 5, chr(127))
                self.multi_cell(0, 5, item_text, align='L')
                self.ln(1)
            self.ln(3)
        elif tag_name == 'table':
            self.ln(5)
            self.process_table(tag)
            self.ln(5)
        elif tag_name == 'br':
            self.ln(5)
        elif tag_name == 'hr':
            self.ln(2)
            self.line(self.get_x(), self.get_y(), self.w - self.r_margin, self.get_y())
            self.ln(4)
        else:
            if text.strip():
                self.set_font('helvetica', current_style, 10)
                self.multi_cell(0, 5, text, align='L')
                self.ln(1)

    def process_table(self, table_tag):
        """Draw a table with equal-width bordered columns.

        The column count comes from the first row; rows with a different
        number of cells are skipped. A row is styled as a header (bold, grey
        fill) when all its cells are ``th``, or when it is the first row and
        contains at least one ``th``.

        Args:
            table_tag: BeautifulSoup ``table`` tag.
        """
        rows = table_tag.find_all('tr')
        if not rows:
            return

        header_cells = rows[0].find_all(['th', 'td'])
        num_cols = len(header_cells)
        if num_cols == 0:
            return

        effective_width = self.w - self.l_margin - self.r_margin
        col_width = effective_width / num_cols
        default_cell_height = 6

        is_first_row = True
        for row in rows:
            cells = row.find_all(['th', 'td'])
            if len(cells) != num_cols:
                continue
            is_header_row = all(c.name == 'th' for c in cells) or (is_first_row and any(c.name == 'th' for c in cells))
            for cell in cells:
                cell_text = cell.get_text().strip()
                if is_header_row:
                    self.set_font('helvetica', 'B', 9)
                    self.set_fill_color(230, 230, 230)
                    fill = True
                else:
                    self.set_font('helvetica', '', 9)
                    fill = False
                # Stay on the same line and move right so cells sit side by side.
                self.multi_cell(col_width, default_cell_height, cell_text, border=1, align='L', fill=fill, new_x="RIGHT", new_y="TOP")
            self.ln(default_cell_height)
            is_first_row = False
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 283 |
|
| 284 |
# Typographic characters that commonly appear in generated text but do not
# exist in Latin-1, mapped to close ASCII equivalents.
_LATIN1_FALLBACKS = str.maketrans({
    '\u2018': "'", '\u2019': "'",
    '\u201c': '"', '\u201d': '"',
    '\u2013': '-', '\u2014': '-', '\u2212': '-',
    '\u2022': '-', '\u2026': '...',
})


def create_pdf_report(report_text):
    """Convert a Markdown report into an in-memory PDF.

    The Markdown is converted to HTML (with table support), parsed with
    BeautifulSoup, and each top-level element is drawn by ``PDF_Generator``.

    Args:
        report_text: Markdown source of the report; surrounding code fences
            are stripped.

    Returns:
        A ``BytesIO`` positioned at the start of the PDF bytes.

    Raises:
        ValueError: If ``report_text`` is empty.
        Exception: Any failure during conversion is shown on the Streamlit
            page and then re-raised unchanged.
    """
    if not report_text:
        st.warning("Report text is empty, skipping PDF generation.")
        raise ValueError("Input report_text cannot be empty.")
    try:
        st.info("Starting PDF generation from markdown report...")
        cleaned_md = re.sub(r'```markdown|```', '', report_text, flags=re.MULTILINE).strip()
        # The PDF is set in the built-in Helvetica font, which is limited to
        # Latin-1: map common typographic characters to ASCII and replace
        # anything else unencodable with '?' rather than failing.
        cleaned_md = cleaned_md.translate(_LATIN1_FALLBACKS)
        cleaned_md = cleaned_md.encode('latin-1', 'replace').decode('latin-1')

        html_content = markdown.markdown(cleaned_md, extensions=['tables'])
        soup = BeautifulSoup(html_content, 'html.parser')

        pdf = PDF_Generator()
        pdf.set_auto_page_break(auto=True, margin=15)
        pdf.set_left_margin(15)
        pdf.set_right_margin(15)
        pdf.add_page()
        pdf.set_font('helvetica', '', 10)
        for element in soup.find_all(recursive=False):
            pdf.add_html_element(element, set())

        st.info("Content added to PDF. Outputting PDF to buffer...")
        raw_output = pdf.output()
        # fpdf2 returns a bytearray (which has no .encode); the legacy
        # library returned a Latin-1 str. Accept both.
        if isinstance(raw_output, str):
            pdf_output = raw_output.encode('latin-1')
        else:
            pdf_output = bytes(raw_output)
        st.success("PDF report generated successfully.")
        return BytesIO(pdf_output)
    except Exception as e:
        st.error(f"Failed to generate PDF: {e}")
        st.exception(e)
        raise
|
|
|
|
| 309 |
|
| 310 |
def _upload_signature(kind, files):
    """Return a hashable fingerprint (kind, name, size) for uploaded files."""
    return (kind,) + tuple((f.name, getattr(f, 'size', None)) for f in files)


def _load_pdf_transactions(uploaded_files):
    """Run every uploaded PDF through Gemini and return all transactions."""
    model = configure_gemini(api_key)
    progress_bar = st.progress(0)
    all_transactions = []
    for i, file in enumerate(uploaded_files):
        st.text(f"Processing {file.name}...")
        try:
            pdf_reader, total_pages = read_pdf_pages(file)
            if total_pages > 0:
                all_transactions.extend(process_pdf_pages(model, pdf_reader, total_pages))
        except Exception as e:
            # One unreadable statement must not discard the rest of the batch.
            st.error(f"Error processing {file.name}: {e}")
            st.exception(e)
        progress_bar.progress((i + 1) / len(uploaded_files))
    return all_transactions


def _render_report(filtered_df, start_date, end_date, statement_type):
    """Aggregate the filtered rows, request the report and offer the PDF."""
    filtered_transactions_list = filtered_df.to_dict(orient='records')
    try:
        with st.spinner("Aggregating financial data locally..."):
            aggregated_summary = aggregate_financial_data(filtered_transactions_list, statement_type)

        report_text = None
        if aggregated_summary:
            with st.spinner("Generating formatted report with Gemini..."):
                model1 = configure_gemini1(api_key)
                report_text = generate_financial_report(model1, aggregated_summary, start_date, end_date, statement_type)

        if not report_text:
            st.error("Failed to generate the financial report from the aggregated data.")
            return

        st.success("Financial report generated successfully!")
        st.markdown("### Financial Report Preview")
        # NOTE(review): the report is model output derived from uploaded
        # documents and is rendered with raw HTML enabled - confirm this is
        # intended.
        st.markdown(report_text, unsafe_allow_html=True)
        pdf_buffer = create_pdf_report(report_text)
        st.download_button(
            label="Download Financial Report as PDF",
            data=pdf_buffer,
            file_name=f"{statement_type.replace(' ', '_')}_{datetime.now().strftime('%Y%m%d')}.pdf",
            mime="application/pdf"
        )
    except Exception as e:
        st.error(f"An unexpected error occurred during the report generation process: {e}")
        st.exception(e)


def main():
    """Streamlit entry point: load transactions, show them, build a report.

    Transactions come either from uploaded PDF statements (extracted with
    Gemini) or from an uploaded CSV, and are kept in ``st.session_state``.
    A fingerprint of the last loaded upload is stored alongside them so that
    script reruns triggered by other widgets do not reprocess the same files.
    """
    st.title("Quantitlytix AI")
    st.markdown("*Bank Statement Parser & Financial Report Generator*")

    session_defaults = {
        'min_date': date(2024, 1, 1),
        'max_date': date.today(),
        'transactions': [],
        'loaded_source': None,
    }
    for key, value in session_defaults.items():
        if key not in st.session_state:
            st.session_state[key] = value

    input_type = st.sidebar.radio("Select Input Type", ("Bulk Bank Statement Upload", "CSV Upload"))

    if input_type == "Bulk Bank Statement Upload":
        uploaded_files = st.file_uploader("Upload PDF bank statements", type="pdf", accept_multiple_files=True)
        if uploaded_files:
            st.info(f"User uploaded {len(uploaded_files)} PDF file(s).")
            signature = _upload_signature('pdf', uploaded_files)
            # The script reruns on every interaction; only call Gemini when
            # the set of uploaded files has actually changed.
            if st.session_state['loaded_source'] != signature:
                st.session_state['transactions'] = _load_pdf_transactions(uploaded_files)
                st.session_state['loaded_source'] = signature
            st.success(f"All PDF files processed. Total transactions collected: {len(st.session_state['transactions'])}.")

    elif input_type == "CSV Upload":
        uploaded_csv = st.file_uploader("Upload CSV of transactions", type="csv")
        if uploaded_csv:
            st.info(f"User uploaded CSV file: {uploaded_csv.name}.")
            signature = _upload_signature('csv', [uploaded_csv])
            if st.session_state['loaded_source'] != signature:
                # Rewind in case the buffer was already consumed by an
                # earlier run.
                uploaded_csv.seek(0)
                df = pd.read_csv(uploaded_csv)
                df = df.loc[:, ~df.columns.str.startswith('Unnamed:')]
                st.session_state['transactions'] = df.to_dict(orient='records')
                st.session_state['loaded_source'] = signature
            st.success(f"Successfully loaded {len(st.session_state['transactions'])} transactions from CSV.")

    if st.session_state['transactions']:
        st.info("Consolidating and displaying all extracted transactions.")
        df = pd.DataFrame(st.session_state['transactions'])
        if 'Date' not in df.columns:
            st.warning("Loaded transactions have no 'Date' column, so they cannot be displayed or filtered by date.")
        else:
            df['Date'] = pd.to_datetime(df['Date'], errors='coerce', dayfirst=True)
            df.dropna(subset=['Date'], inplace=True)
            if df.empty:
                st.warning("None of the loaded transactions has a parseable date.")
            else:
                # Default the report period to the span of the loaded data.
                st.session_state['min_date'] = df['Date'].min().date()
                st.session_state['max_date'] = df['Date'].max().date()
                st.write("### Extracted Transactions")
                st.dataframe(df.astype(str))
    else:
        st.info("No transactions loaded yet. Upload files to begin.")

    st.write("### Generate Financial Report")
    col1, col2 = st.columns(2)
    with col1:
        start_date = st.date_input("Start Date", st.session_state['min_date'])
    with col2:
        end_date = st.date_input("End Date", st.session_state['max_date'])
    statement_type = st.selectbox("Select Financial Statement", ["Income Statement", "Cashflow Statement", "Balance Sheet"])

    if st.button("Generate Financial Report"):
        if not st.session_state['transactions']:
            st.error("No transactions available to generate report. Please upload files first.")
            return
        if start_date > end_date:
            st.error("Start Date must not be later than End Date.")
            return

        df = pd.DataFrame(st.session_state['transactions'])
        if 'Date' not in df.columns:
            st.error("Transactions have no 'Date' column, so a report period cannot be applied.")
            return

        df['Date'] = pd.to_datetime(df['Date'], errors='coerce', dayfirst=True)
        mask = (df['Date'] >= pd.to_datetime(start_date)) & (df['Date'] <= pd.to_datetime(end_date))
        filtered_df = df.loc[mask]

        if filtered_df.empty:
            st.warning("No transactions found within the selected date range.")
            return

        st.info(f"Found {len(filtered_df)} transactions within the selected date range.")
        _render_report(filtered_df, start_date, end_date, statement_type)
|
| 409 |
|
| 410 |
if __name__ == "__main__":
|