rairo commited on
Commit
ec735f4
·
verified ·
1 Parent(s): 0f5777e

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +218 -691
app.py CHANGED
@@ -2,52 +2,54 @@ import re
2
  import json
3
  import os
4
  import time
5
- from datetime import datetime, date, timedelta
6
  from io import BytesIO
7
- import requests
8
  import pandas as pd
9
  import streamlit as st
10
  import google.generativeai as genai
11
  import pypdf
12
  from fpdf import FPDF
13
- from fpdf.enums import XPos, YPos
14
- import markdown
15
  from google.api_core import exceptions
16
- from html_to_markdown import convert_to_markdown
 
17
 
18
- # Configure API key for Gemini
19
  api_key = os.getenv('Gemini')
20
 
21
  def configure_gemini(api_key):
22
- st.info("Configuring Gemini API for transaction extraction...") # Log
 
 
 
23
  genai.configure(api_key=api_key)
 
24
  return genai.GenerativeModel('gemini-2.0-flash-thinking-exp')
25
 
26
  def configure_gemini1(api_key):
27
- st.info("Configuring Gemini API for report generation...") # Log
 
 
 
28
  genai.configure(api_key=api_key)
 
29
  return genai.GenerativeModel('gemini-2.5-pro')
30
-
31
- # Read PDF content page by page from a file-like object
32
  def read_pdf_pages(file_obj):
33
- st.info(f"Reading PDF pages from {file_obj.name}...") # Log
34
- file_obj.seek(0) # Ensure the file pointer is at the start
35
  pdf_reader = pypdf.PdfReader(file_obj)
36
  total_pages = len(pdf_reader.pages)
37
- st.info(f"Found {total_pages} pages in PDF.") # Log
38
  return pdf_reader, total_pages
39
 
40
- # Extract text from a specific page
41
  def extract_page_text(pdf_reader, page_num):
42
- # st.debug(f"Extracting text from page {page_num + 1}...") # Too verbose for general logging
43
  if page_num < len(pdf_reader.pages):
44
  text = pdf_reader.pages[page_num].extract_text()
45
- if not text.strip():
46
- st.warning(f"Page {page_num + 1} appears to be empty or contains no extractable text.") # Log empty pages
47
  return text if text else ""
48
  return ""
49
 
50
- # Process a chunk of PDF text with Gemini to extract transactions as JSON
51
  def process_with_gemini(model, text):
52
  prompt = """Analyze this bank statement and extract transactions in JSON format with these fields:
53
  - Date (format DD/MM/YYYY)
@@ -74,422 +76,166 @@ def process_with_gemini(model, text):
74
  ]
75
  }"""
76
  try:
77
- # st.debug("Sending text chunk to Gemini for transaction extraction...") # Too verbose
78
  response = model.generate_content([prompt, text])
79
- time.sleep(6) # Sleep for 6 seconds to work around rate limit
80
- # st.debug("Received response from Gemini for transaction extraction.") # Too verbose
81
  return response.text
82
- except exceptions.ServiceUnavailable as e:
83
- if e.response.status_code == 504:
84
- st.error("Error generating report: Gemini API timed out (504). Please try reducing the time period for the report.")
85
- return None
86
- else:
87
- st.error(f"Gemini API error during transaction extraction: {e}") # Log other API errors
88
- raise
89
  except Exception as e:
90
- st.error(f"An unexpected error occurred during Gemini transaction extraction: {e}") # Catch other potential errors
91
  return None
92
 
93
- # Process PDF page by page to handle large files
94
  def process_pdf_pages(model, pdf_reader, total_pages, progress_callback=None):
95
  all_transactions = []
96
- st.info(f"Starting page-by-page PDF processing for {total_pages} pages...") # Log
97
-
98
- # Process pages individually or in small chunks
99
  for page_num in range(total_pages):
100
- # Update progress if callback provided
101
  if progress_callback:
102
  progress_callback(page_num / total_pages, f"Processing page {page_num + 1} of {total_pages}")
103
-
104
- # Extract text from current page
105
  page_text = extract_page_text(pdf_reader, page_num)
106
-
107
  if not page_text.strip():
108
- st.warning(f"Skipping empty or unreadable page {page_num + 1}.") # Log skipped pages
109
- continue # Skip empty pages
110
-
111
- # Process the page with Gemini
112
- st.info(f"Sending page {page_num + 1} text to Gemini for transaction extraction...") # Log
113
  json_response = process_with_gemini(model, page_text)
114
-
115
  if json_response:
116
- # Extract JSON from response
117
- start_idx = json_response.find('{')
118
- end_idx = json_response.rfind('}') + 1
119
-
120
- if start_idx == -1 or end_idx == 0:
121
- st.warning(f"No valid JSON found in Gemini response for page {page_num + 1}. Raw response: {json_response[:200]}...") # Log invalid JSON structure
122
- continue # Skip invalid JSON
123
-
124
- json_str = json_response[start_idx:end_idx]
125
- json_str = json_str.replace('```json', '').replace('```', '')
126
-
127
  try:
128
  data = json.loads(json_str)
129
  transactions = data.get('transactions', [])
130
  if transactions:
131
- st.info(f"Successfully extracted {len(transactions)} transactions from page {page_num + 1}.") # Log successful extraction
132
  all_transactions.extend(transactions)
133
- else:
134
- st.info(f"No transactions found on page {page_num + 1} based on Gemini's analysis.") # Log no transactions found on page
135
  except json.JSONDecodeError:
136
- st.error(f"Failed to decode JSON from Gemini response for page {page_num + 1}. Check response format. Raw JSON snippet: {json_str[:200]}...") # Log JSON decode errors
137
- continue # Skip invalid JSON
138
  else:
139
- st.warning(f"Gemini returned no response for page {page_num + 1}. This page's transactions might be missing.") # Log no response from Gemini
140
 
141
- st.info(f"Finished processing all pages. Total transactions extracted: {len(all_transactions)}.") # Final log for extraction
142
  return all_transactions
143
 
144
- # Generate financial report from aggregated JSON transactions and chosen parameters
145
- def generate_financial_report(model, json_data, start_date, end_date, statement_type):
146
- st.info(f"Preparing prompt for Gemini to generate {statement_type} report from {start_date} to {end_date}...") # Log
147
- prompt = f"""Based on the following transactions JSON data:
148
- {json.dumps(json_data)}
149
- Generate a detailed {statement_type} report for the period from {start_date.strftime('%d/%m/%Y')} to {end_date.strftime('%d/%m/%Y')}. Present the report in a standard accounting format relevant to South Africa, but with improved readability and visual appeal.
150
-
151
- Specific Formatting and Content Requirements:
152
-
153
- Standard Accounting Structure (South Africa Focus): Organize the {statement_type} according to typical accounting practices followed in South Africa (e.g., for an Income Statement, clearly separate Revenue, Cost of Goods Sold, Gross Profit, Operating Expenses, and Net Income, in nice tables considering local terminology where applicable). If unsure of specific local variations, adhere to widely accepted international accounting structures.
154
- Clear Headings and Subheadings: Use distinct and informative headings and subheadings in English to delineate different sections of the report. Ensure these are visually prominent.
155
- Consistent Formatting: Maintain consistent formatting for monetary values (e.g., using "R" for South African Rand if applicable and discernible from the data, comma separators for thousands), dates, and alignment.
156
- Totals and Subtotals: Clearly display totals for relevant categories and subtotals where appropriate to provide a clear understanding of the financial performance or position.
157
- Descriptive Line Items: Use clear and concise descriptions for each transaction or aggregated account based on the provided JSON data.
158
- Key Insights: Include a brief section (e.g., "Key Highlights" or "Summary") that identifies significant trends, notable figures, or key performance indicators derived from the data within the statement. This should be written in plain, understandable English, potentially highlighting aspects particularly relevant to the economic context of Zimbabwe if discernible from the data.
159
- Concise Summary: Provide a concluding summary paragraph that encapsulates the overall financial picture presented in the {statement_type}.
160
-
161
- Format the report in Markdown for better visual structure.
162
- Do not name the company if name is not there and return just the report and nothing else."""
163
- try:
164
- st.info("Sending request to Gemini for financial report generation...") # Log
165
- response = model.generate_content([prompt])
166
- time.sleep(7) # Sleep for 7 seconds to work around rate limit
167
- st.success("Successfully received financial report from Gemini.") # Log success
168
- return response.text
169
- except exceptions.ServiceUnavailable as e:
170
- if e.response.status_code == 504:
171
- st.error("Error generating report: Gemini API timed out (504). Please try reducing the time period for the report.")
172
- st.session_state['last_error'] = "504" # Store the error in session state
173
- return None
174
- else:
175
- st.error(f"Gemini API error during report generation: {e}") # Log other API errors
176
- raise
177
- except Exception as e:
178
- st.error(f"An unexpected error occurred during Gemini report generation: {e}") # Catch other potential errors
179
  return None
180
 
181
- def chunk_transactions(transactions, batch_size=400):
182
- """Split transactions into smaller batches for processing."""
183
- batches = []
184
- for i in range(0, len(transactions), batch_size):
185
- batch = transactions[i:i + batch_size]
186
- batches.append(batch)
187
- st.info(f"Split {len(transactions)} transactions into {len(batches)} batches of up to {batch_size} transactions each.")
188
- return batches
189
-
190
- def generate_batch_summary(model, json_data, start_date, end_date, statement_type, batch_num, total_batches):
191
- """Generate a summary analysis for a batch of transactions."""
192
- st.info(f"Processing batch {batch_num}/{total_batches} with {len(json_data['transactions'])} transactions...")
193
-
194
- prompt = f"""Analyze this batch of transactions (batch {batch_num} of {total_batches}) for the period from {start_date.strftime('%d/%m/%Y')} to {end_date.strftime('%d/%m/%Y')}.
195
-
196
- Transaction data:
197
- {json.dumps(json_data)}
198
-
199
- Create a structured summary focusing on aggregation and categorization. Return ONLY the following JSON structure:
200
-
201
- {{
202
- "batch_info": {{
203
- "batch_number": {batch_num},
204
- "total_batches": {total_batches},
205
- "transaction_count": {len(json_data['transactions'])},
206
- "date_range": "{start_date.strftime('%d/%m/%Y')} to {end_date.strftime('%d/%m/%Y')}"
207
- }},
208
- "financial_summary": {{
209
- "total_income": 0,
210
- "total_expenses": 0,
211
- "net_position": 0
212
- }},
213
- "income_breakdown": {{
214
- "by_customer": {{}},
215
- "by_month": {{}}
216
- }},
217
- "expense_breakdown": {{
218
- "by_category": {{}},
219
- "by_month": {{}}
220
- }},
221
- "key_transactions": [
222
- // Top 5 largest transactions (income and expense)
223
- ],
224
- "monthly_totals": {{
225
- // Format: "YYYY-MM": {{"income": 0, "expenses": 0, "net": 0}}
226
- }}
227
- }}
228
-
229
- Focus on numerical aggregation and categorization. Be precise with calculations."""
230
 
231
- try:
232
- response = model.generate_content([prompt])
233
- time.sleep(4)
234
- return response.text
235
- except exceptions.ServiceUnavailable as e:
236
- if e.response.status_code == 504:
237
- st.error(f"Batch {batch_num} timed out. Skipping this batch.")
238
- return None
239
- else:
240
- st.error(f"API error processing batch {batch_num}: {e}")
241
- raise
242
- except Exception as e:
243
- st.error(f"Error processing batch {batch_num}: {e}")
244
- return None
245
 
246
- def consolidate_batch_summaries(batch_summaries, start_date, end_date, statement_type):
247
- """Combine multiple batch summaries into aggregated data structure."""
248
- st.info(f"Consolidating {len(batch_summaries)} batch summaries...")
249
-
250
- consolidated = {
251
- "total_batches": len(batch_summaries),
252
- "total_transactions": 0,
253
- "date_range": f"{start_date.strftime('%d/%m/%Y')} to {end_date.strftime('%d/%m/%Y')}",
254
- "financial_summary": {
255
- "total_income": 0,
256
- "total_expenses": 0,
257
- "net_position": 0
258
- },
259
- "income_breakdown": {
260
- "by_customer": {},
261
- "by_month": {}
262
- },
263
- "expense_breakdown": {
264
- "by_category": {},
265
- "by_month": {}
266
- },
267
- "key_transactions": [],
268
- "monthly_totals": {}
269
  }
270
-
271
- # Process each batch summary
272
- for batch_data in batch_summaries:
273
- if not batch_data:
274
- continue
275
-
276
- try:
277
- # Extract JSON from response if needed
278
- if isinstance(batch_data, str):
279
- start_idx = batch_data.find('{')
280
- end_idx = batch_data.rfind('}') + 1
281
- if start_idx != -1 and end_idx > start_idx:
282
- json_str = batch_data[start_idx:end_idx]
283
- batch_data = json.loads(json_str)
284
- else:
285
- st.warning("Could not extract JSON from batch summary")
286
- continue
287
-
288
- # Aggregate financial summary
289
- if 'financial_summary' in batch_data:
290
- fs = batch_data['financial_summary']
291
- consolidated['financial_summary']['total_income'] += fs.get('total_income', 0)
292
- consolidated['financial_summary']['total_expenses'] += fs.get('total_expenses', 0)
293
-
294
- # Aggregate transaction count
295
- if 'batch_info' in batch_data:
296
- consolidated['total_transactions'] += batch_data['batch_info'].get('transaction_count', 0)
297
-
298
- # Merge income breakdown by customer
299
- if 'income_breakdown' in batch_data:
300
- for customer, amount in batch_data['income_breakdown'].get('by_customer', {}).items():
301
- consolidated['income_breakdown']['by_customer'][customer] = \
302
- consolidated['income_breakdown']['by_customer'].get(customer, 0) + amount
303
-
304
- # Merge income by month
305
- for month, amount in batch_data['income_breakdown'].get('by_month', {}).items():
306
- consolidated['income_breakdown']['by_month'][month] = \
307
- consolidated['income_breakdown']['by_month'].get(month, 0) + amount
308
-
309
- # Merge expense breakdown by category
310
- if 'expense_breakdown' in batch_data:
311
- for category, amount in batch_data['expense_breakdown'].get('by_category', {}).items():
312
- consolidated['expense_breakdown']['by_category'][category] = \
313
- consolidated['expense_breakdown']['by_category'].get(category, 0) + amount
314
-
315
- # Merge expenses by month
316
- for month, amount in batch_data['expense_breakdown'].get('by_month', {}).items():
317
- consolidated['expense_breakdown']['by_month'][month] = \
318
- consolidated['expense_breakdown']['by_month'].get(month, 0) + amount
319
-
320
- # Collect key transactions
321
- if 'key_transactions' in batch_data:
322
- consolidated['key_transactions'].extend(batch_data.get('key_transactions', []))
323
-
324
- # Merge monthly totals
325
- if 'monthly_totals' in batch_data:
326
- for month, totals in batch_data['monthly_totals'].items():
327
- if month not in consolidated['monthly_totals']:
328
- consolidated['monthly_totals'][month] = {"income": 0, "expenses": 0, "net": 0}
329
-
330
- consolidated['monthly_totals'][month]['income'] += totals.get('income', 0)
331
- consolidated['monthly_totals'][month]['expenses'] += totals.get('expenses', 0)
332
- consolidated['monthly_totals'][month]['net'] += totals.get('net', 0)
333
-
334
- except json.JSONDecodeError as e:
335
- st.warning(f"Could not parse batch summary JSON: {e}")
336
- continue
337
- except Exception as e:
338
- st.warning(f"Error processing batch summary: {e}")
339
- continue
340
-
341
- # Calculate final net position
342
- consolidated['financial_summary']['net_position'] = \
343
- consolidated['financial_summary']['total_income'] - consolidated['financial_summary']['total_expenses']
344
-
345
- st.success(f"Successfully consolidated data from {len(batch_summaries)} batches covering {consolidated['total_transactions']} transactions.")
346
- return consolidated
347
-
348
- def generate_final_report(model, consolidated_data, statement_type):
349
- """Generate the final comprehensive report using consolidated batch data."""
350
- st.info("Generating final comprehensive report from consolidated data...")
351
-
352
- prompt = f"""Using this consolidated financial data, generate a comprehensive {statement_type} report:
353
-
354
- Consolidated Data:
355
- {json.dumps(consolidated_data, indent=2)}
356
-
357
- Generate a detailed {statement_type} report with the following requirements:
358
-
359
- 1. **Professional Format**: Use standard South African accounting format and terminology
360
- 2. **Clear Structure**: Organize with proper headings, subheadings, and sections
361
- 3. **Comprehensive Analysis**: Include:
362
- - Executive Summary
363
- - Detailed breakdown by categories/customers
364
- - Monthly trend analysis
365
- - Key performance indicators
366
- - Notable transactions and patterns
367
- 4. **Visual Elements**: Use tables, proper formatting for better readability
368
- 5. **Insights**: Provide meaningful business insights based on the data
369
- 6. **Currency**: Use "R" for South African Rand where appropriate
370
-
371
- Return the report in well-formatted Markdown. Do not include company name if not available.
372
- Focus on creating a professional, comprehensive financial statement that provides clear insights into the business performance."""
373
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
374
  try:
 
375
  response = model.generate_content([prompt])
376
- time.sleep(6)
377
- st.success("Final comprehensive report generated successfully!")
378
  return response.text
379
- except exceptions.ServiceUnavailable as e:
380
- if e.response.status_code == 504:
381
- st.error("Final report generation timed out. The consolidated data might be too large.")
382
- return None
383
- else:
384
- st.error(f"API error generating final report: {e}")
385
- raise
386
- except Exception as e:
387
- st.error(f"Error generating final report: {e}")
388
  return None
389
-
390
- def generate_batched_financial_report(model, filtered_transactions, start_date, end_date, statement_type, batch_size=400):
391
- """Main function to generate financial report using batch processing."""
392
- st.info(f"Starting batched financial report generation for {len(filtered_transactions)} transactions...")
393
-
394
- # Step 1: Split transactions into batches
395
- transaction_batches = chunk_transactions(filtered_transactions, batch_size)
396
-
397
- # Step 2: Process each batch
398
- batch_summaries = []
399
- progress_bar = st.progress(0)
400
- status_text = st.empty()
401
-
402
- for i, batch in enumerate(transaction_batches):
403
- progress = (i + 1) / len(transaction_batches)
404
- progress_bar.progress(progress)
405
- status_text.text(f"Processing batch {i + 1} of {len(transaction_batches)}...")
406
-
407
- batch_json = {"transactions": batch}
408
- summary = generate_batch_summary(model, batch_json, start_date, end_date, statement_type, i + 1, len(transaction_batches))
409
-
410
- if summary:
411
- batch_summaries.append(summary)
412
-
413
- progress_bar.progress(1.0)
414
- status_text.text("All batches processed!")
415
-
416
- if not batch_summaries:
417
- st.error("No batch summaries were successfully generated.")
418
  return None
419
-
420
- # Step 3: Consolidate batch summaries
421
- consolidated_data = consolidate_batch_summaries(batch_summaries, start_date, end_date, statement_type)
422
-
423
- # Step 4: Generate final comprehensive report
424
- final_report = generate_final_report(model, consolidated_data, statement_type)
425
-
426
- return final_report
427
- # Install required libraries:
428
- # pip install fpdf2 beautifulsoup4 markdown
429
-
430
- from bs4 import BeautifulSoup
431
- # For logging errors/info
432
 
 
433
  class PDF_Generator(FPDF):
434
- """
435
- FPDF subclass to potentially add headers/footers later if needed.
436
- Currently just a basic FPDF wrapper.
437
- """
438
- def __init__(self, orientation='P', unit='mm', format='A4'):
439
- super().__init__(orientation, unit, format)
440
- self.set_auto_page_break(auto=True, margin=15) # Enable auto page break
441
- self.set_left_margin(15)
442
- self.set_right_margin(15)
443
- self.alias_nb_pages() # Allows for page numbering {nb}
444
-
445
- # Example: Add a simple footer
446
- # def footer(self):
447
- # self.set_y(-15) # Position 1.5 cm from bottom
448
- # self.set_font('helvetica', 'I', 8)
449
- # self.cell(0, 10, f'Page {self.page_no()}/{{nb}}', 0, 0, 'C')
450
-
451
  def add_html_element(self, tag, styles):
452
- """ Processes a single HTML tag """
453
  text = tag.get_text()
454
  tag_name = tag.name.lower()
455
-
456
- # --- Basic Styling ---
457
  current_style = ''
458
- if 'b' in styles or 'strong' in styles:
459
- current_style += 'B'
460
- if 'i' in styles or 'em' in styles:
461
- current_style += 'I'
462
- # Reset font to default if no style
463
- if not current_style:
464
- self.set_font('helvetica', '', self.font_size_pt) # Reset to regular if needed
465
-
466
- # --- Handle Specific Tags ---
467
  if tag_name in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']:
468
  level = int(tag_name[1])
469
  font_size = {1: 18, 2: 16, 3: 14, 4: 12, 5: 11, 6: 10}.get(level, 10)
470
- self.set_font('helvetica', 'B', font_size) # Headings usually bold
471
- self.multi_cell(0, font_size * 0.5, text, align='L') # Use approx line height
472
- self.ln(font_size * 0.3) # Space after heading
473
- self.set_font('helvetica', '', 10) # Reset font size
474
  elif tag_name == 'p':
475
  self.set_font('helvetica', current_style, 10)
476
- self.multi_cell(0, 5, text, align='L') # 5mm line height
477
- self.ln(3) # Space after paragraph
478
  elif tag_name == 'ul':
479
  self.ln(2)
480
  for item in tag.find_all('li', recursive=False):
481
- self.set_font('helvetica', '', 10) # Reset font for list item text
482
- item_text = item.get_text()
483
- self.cell(5, 5, chr(127)) # Bullet point (using a circle character)
484
- self.multi_cell(0, 5, item_text, align='L') # Remaining width
485
- self.ln(1) # Small space between items
486
- self.ln(3)
487
- elif tag_name == 'ol':
488
- self.ln(2)
489
- for i, item in enumerate(tag.find_all('li', recursive=False), 1):
490
- self.set_font('helvetica', '', 10) # Reset font for list item text
491
  item_text = item.get_text()
492
- self.cell(8, 5, f"{i}.") # Numbered item
493
  self.multi_cell(0, 5, item_text, align='L')
494
  self.ln(1)
495
  self.ln(3)
@@ -497,387 +243,168 @@ class PDF_Generator(FPDF):
497
  self.ln(5)
498
  self.process_table(tag)
499
  self.ln(5)
500
- elif tag_name in ['b', 'strong', 'i', 'em']:
501
- # Handled by style tracking within parent elements for now
502
- # Direct rendering might be needed for nested styles
503
- pass # Style is applied by parent
504
- elif tag_name == 'br':
505
- self.ln(5) # Treat <br> as a line break
506
  elif tag_name == 'hr':
507
  self.ln(2)
508
  self.line(self.get_x(), self.get_y(), self.w - self.r_margin, self.get_y())
509
  self.ln(4)
510
  else:
511
- # Fallback for unknown tags - just print text content
512
- if text.strip(): # Only print if there's actual text
513
  self.set_font('helvetica', current_style, 10)
514
  self.multi_cell(0, 5, text, align='L')
515
  self.ln(1)
516
 
517
  def process_table(self, table_tag):
518
- """ Rudimentary table processing """
519
  rows = table_tag.find_all('tr')
520
- if not rows:
521
- return
522
-
523
- # --- Determine number of columns (use first row) ---
524
  header_cells = rows[0].find_all(['th', 'td'])
525
  num_cols = len(header_cells)
526
- if num_cols == 0:
527
- return
528
-
529
- # --- Calculate column widths (simple equal distribution) ---
530
- # Effective page width = Page width - left margin - right margin
531
  effective_width = self.w - self.l_margin - self.r_margin
532
  col_width = effective_width / num_cols
533
- default_cell_height = 6 # Adjust as needed
534
-
535
- # --- Process Header Row ---
536
  is_first_row = True
537
  for row in rows:
538
  cells = row.find_all(['th', 'td'])
539
- if len(cells) != num_cols:
540
- st.warning(f"Table row has inconsistent number of cells ({len(cells)} vs {num_cols}). Skipping row.")
541
- continue # Skip rows with wrong number of cells
542
-
543
- # Check page break possibility before drawing row
544
- max_cell_h = default_cell_height # Start with default
545
- # Estimate height needed (very basic, doesn't account for actual wrap height)
546
- for cell in cells:
547
- # This is a rough estimate, multi_cell calculates real height
548
- pass # Cannot easily pre-calculate multi_cell height
549
-
550
- # If estimated height exceeds remaining page space, add page
551
- # Note: FPDF's auto page break handles this better with multi_cell
552
- # if self.get_y() + max_cell_h > self.page_break_trigger:
553
- # self.add_page()
554
-
555
  is_header_row = all(c.name == 'th' for c in cells) or (is_first_row and any(c.name == 'th' for c in cells))
556
-
557
  for i, cell in enumerate(cells):
558
  cell_text = cell.get_text().strip()
559
  if is_header_row:
560
- self.set_font('helvetica', 'B', 9) # Bold header text
561
- self.set_fill_color(230, 230, 230) # Light grey fill
562
  fill = True
563
  else:
564
- self.set_font('helvetica', '', 9) # Regular text
565
- self.set_fill_color(255, 255, 255) # No fill (or alternate row color)
566
- fill = False # Or implement zebra striping
567
-
568
- # Use multi_cell for text wrapping. Draw border '1'. Align 'L'.
569
- # multi_cell automatically handles height based on content.
570
- self.multi_cell(col_width, default_cell_height, cell_text, border=1, align='L', fill=fill, ln=3) # ln=3 moves to beginning of next cell
571
-
572
- self.ln(default_cell_height) # Move down after the row is complete (based on default height, multi_cell might make row taller)
573
- is_first_row = False # Header only applies to the first row potentially
574
-
575
 
576
  def create_pdf_report(report_text):
577
- """
578
- Creates a PDF from markdown text locally using FPDF2.
579
-
580
- Args:
581
- report_text (str): Markdown formatted report text.
582
-
583
- Returns:
584
- BytesIO: PDF file in memory buffer.
585
-
586
- Raises:
587
- Exception: If PDF generation fails.
588
- """
589
  if not report_text:
590
- st.warning("Report text is empty, skipping PDF generation.") # Log
591
  raise ValueError("Input report_text cannot be empty.")
592
-
593
  try:
594
- st.info("Starting PDF generation from markdown report...") # Log
595
- # 1. Clean Markdown
596
- cleaned_md = re.sub(r'^```markdown\s*', '', report_text, flags=re.MULTILINE)
597
- cleaned_md = re.sub(r'\s*```$', '', cleaned_md, flags=re.MULTILINE)
598
- cleaned_md = cleaned_md.strip()
599
- # st.debug("Markdown cleaned.") # Too verbose
600
-
601
- # 2. Convert Markdown to HTML
602
- html_content = markdown.markdown(cleaned_md, extensions=['tables', 'fenced_code', 'sane_lists'])
603
- if not html_content:
604
- st.error("Markdown parsing resulted in empty HTML.") # Log
605
- raise ValueError("Markdown parsing resulted in empty HTML.")
606
- # st.debug("Markdown converted to HTML.") # Too verbose
607
-
608
- # 3. Parse HTML with BeautifulSoup
609
  soup = BeautifulSoup(html_content, 'html.parser')
610
- # st.debug("HTML parsed with BeautifulSoup.") # Too verbose
611
-
612
- # 4. Generate PDF using FPDF
613
  pdf = PDF_Generator()
 
 
 
614
  pdf.add_page()
615
- pdf.set_font('helvetica', '', 10) # Default font
616
- st.info("PDF document initialized, adding content...") # Log
617
-
618
- # Iterate through top-level tags in the HTML body
619
  for element in soup.find_all(recursive=False):
620
- styles = set()
621
- def traverse(tag, current_styles):
622
- local_style_added = None
623
- if tag.name in ['b', 'strong']:
624
- current_styles.add('b')
625
- local_style_added = 'b'
626
- elif tag.name in ['i', 'em']:
627
- current_styles.add('i')
628
- local_style_added = 'i'
629
-
630
- if tag.name in ['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'ul', 'ol', 'table', 'br', 'hr']:
631
- pdf.add_html_element(tag, current_styles.copy())
632
- else:
633
- if hasattr(tag, 'contents'):
634
- for child in tag.contents:
635
- if isinstance(child, str):
636
- pass
637
- elif hasattr(child, 'name'):
638
- traverse(child, current_styles.copy())
639
-
640
- if local_style_added and local_style_added in current_styles:
641
- current_styles.remove(local_style_added)
642
-
643
- traverse(element, styles)
644
-
645
- st.info("Content added to PDF. Outputting PDF to buffer...") # Log
646
- # 5. Output PDF to BytesIO buffer
647
- pdf_output = pdf.output(dest='S') # Output as bytes string
648
- if isinstance(pdf_output, str):
649
- # If output is string (older fpdf versions?), encode it
650
- pdf_output = pdf_output.encode('latin-1')
651
-
652
- st.success("PDF report generated successfully.") # Log success
653
  return BytesIO(pdf_output)
654
-
655
- except ImportError:
656
- st.error("FPDF2, BeautifulSoup4 or Markdown library not installed. Please install using: pip install fpdf2 beautifulsoup4 markdown")
657
- raise Exception("Missing required libraries: FPDF2, BeautifulSoup4, Markdown") from None
658
  except Exception as e:
659
- st.error(f"Failed to generate PDF locally using FPDF: {type(e).__name__}: {e}")
660
- st.exception(e) # Show traceback in streamlit logs
661
- raise Exception(f"Local FPDF PDF generation failed: {e}") from e
662
-
663
 
664
  def main():
665
  st.title("Quantitlytix AI")
666
  st.markdown("*Bank Statement Parser & Financial Report Generator*")
667
- st.info("Application started. Ready for user input.") # Log app start
668
 
669
- # Initialize session state for last error
670
- if 'last_error' not in st.session_state:
671
- st.session_state['last_error'] = None
672
-
673
- # Initialize session state for transaction date range
674
  if 'min_date' not in st.session_state:
675
- st.session_state['min_date'] = date(2024, 1, 1) # Default min date
676
  if 'max_date' not in st.session_state:
677
- st.session_state['max_date'] = date(2024, 12, 31) # Default max date
 
 
678
 
679
- # Sidebar: Select input type: Bulk PDF or CSV Upload
680
  input_type = st.sidebar.radio("Select Input Type", ("Bulk Bank Statement Upload", "CSV Upload"))
681
- st.info(f"Input type selected: {input_type}") # Log input type
682
-
683
- all_transactions = []
684
 
685
  if input_type == "Bulk Bank Statement Upload":
686
  uploaded_files = st.file_uploader("Upload PDF bank statements", type="pdf", accept_multiple_files=True)
687
  if uploaded_files:
688
- st.info(f"User uploaded {len(uploaded_files)} PDF file(s).") # Log file upload
689
- total_files = len(uploaded_files)
690
- st.write(f"{total_files} PDF file(s) uploaded.")
691
- try:
692
- model = configure_gemini(api_key)
693
-
694
- # Create a progress bar
695
- progress_bar = st.progress(0)
696
- status_text = st.empty()
697
-
698
- file_progress = 0
699
- for file_index, uploaded_file in enumerate(uploaded_files):
700
- st.info(f"Starting processing for file {file_index+1}/{total_files}: {uploaded_file.name}") # Log individual file start
701
- # Update file progress
702
- file_progress = (file_index) / total_files
703
- progress_bar.progress(overall_progress) # Corrected variable name
704
- status_text.text(f"Processing file {file_index+1} of {total_files}: {uploaded_file.name}")
705
-
706
- # Get PDF reader and page count
707
- try:
708
- pdf_reader, total_pages = read_pdf_pages(uploaded_file)
709
-
710
- if total_pages == 0:
711
- st.warning(f"No pages found in {uploaded_file.name}. Skipping file.") # Log
712
- continue
713
-
714
- with st.spinner(f"Processing {uploaded_file.name} ({total_pages} pages)..."):
715
- # Define progress callback for page-by-page processing
716
- def update_page_progress(page_progress, message):
717
- # Calculate overall progress (file progress + current file's contribution)
718
- overall_progress = file_progress + (page_progress * (1/total_files))
719
- progress_bar.progress(overall_progress)
720
- status_text.text(f"File {file_index+1}/{total_files}: {message}")
721
-
722
- # Process the PDF page by page
723
- st.info(f"Calling process_pdf_pages for {uploaded_file.name}...") # Log
724
- file_transactions = process_pdf_pages(
725
- model,
726
- pdf_reader,
727
- total_pages,
728
- progress_callback=update_page_progress
729
- )
730
-
731
- # Add transactions from this file to overall list
732
- all_transactions.extend(file_transactions)
733
- st.info(f"Finished processing {uploaded_file.name}. Extracted {len(file_transactions)} transactions.") # Log file completion
734
-
735
- except Exception as e:
736
- st.error(f"Error processing {uploaded_file.name}: {str(e)}") # Log specific file error
737
- st.exception(e) # Show traceback
738
- continue
739
-
740
- # Complete the progress bar
741
- progress_bar.progress(1.0)
742
- status_text.text(f"Completed processing {total_files} files!")
743
- st.success(f"All PDF files processed. Total transactions collected: {len(all_transactions)}.") # Log overall completion
744
-
745
- except Exception as e:
746
- st.error(f"Overall error during PDF document processing: {str(e)}") # Log general error during PDF handling
747
- st.error("Please ensure you're using valid bank statement PDFs and a valid API key")
748
- st.exception(e) # Show traceback
749
  elif input_type == "CSV Upload":
750
  uploaded_csv = st.file_uploader("Upload CSV of transactions", type="csv")
751
  if uploaded_csv:
752
- st.info(f"User uploaded CSV file: {uploaded_csv.name}.") # Log
753
- try:
754
- df = pd.read_csv(uploaded_csv)
755
- # Drop 'Unnamed:' columns from the uploaded CSV
756
- df = df.loc[:, ~df.columns.str.startswith('Unnamed:')]
757
- st.write("CSV Data Preview:")
758
- st.dataframe(df.head())
759
- # Convert dataframe to list of transaction dictionaries
760
- transactions = df.to_dict(orient='records')
761
- all_transactions.extend(transactions)
762
- st.success(f"Successfully loaded {len(transactions)} transactions from CSV.") # Log
763
- except Exception as e:
764
- st.error(f"Error processing CSV file: {str(e)}") # Log CSV error
765
- st.exception(e)
766
-
767
- # If transactions are loaded, show DataFrame and update date ranges
768
- if all_transactions:
769
- st.info("Consolidating and displaying all extracted transactions.") # Log
770
- df = pd.DataFrame(all_transactions)
771
- # Drop 'Unnamed:' columns from the final DataFrame
772
- if not df.empty:
773
  df = df.loc[:, ~df.columns.str.startswith('Unnamed:')]
774
- try:
775
- # Process dates and extract min/max dates for date range inputs
776
- st.info("Parsing transaction dates and determining date range.") # Log
777
- df['Date'] = pd.to_datetime(df['Date'], format='%d/%m/%Y', errors='coerce')
778
-
779
- # Get min and max dates from transactions
780
- if not df['Date'].isna().all():
781
- min_date = df['Date'].min().date()
782
- max_date = df['Date'].max().date()
783
-
784
- # Update session state with actual transaction date range
785
- st.session_state['min_date'] = min_date
786
- st.session_state['max_date'] = max_date
787
- st.info(f"Determined transaction date range: {min_date} to {max_date}.") # Log
788
- else:
789
- st.warning("Could not determine valid date range from transactions. Using default dates.") # Log
790
-
791
- # Format dates for display
792
- df['Date'] = df['Date'].dt.strftime('%d/%m/%Y')
793
-
794
- except Exception as e:
795
- st.warning("Some transaction dates could not be formatted correctly.")
796
- st.exception(e)
797
-
798
- st.success("Transactions loaded successfully!")
799
- st.write("### Extracted Transactions")
800
- st.dataframe(df)
801
- else:
802
- st.warning("No valid transactions could be extracted from the documents.")
803
  else:
804
- st.info("No transactions loaded yet. Upload files to begin.") # Initial state log
805
 
806
- # Financial report generation parameters
807
  st.write("### Generate Financial Report")
808
  col1, col2 = st.columns(2)
809
  with col1:
810
- # Use the min_date from transactions if available, otherwise use default
811
  start_date = st.date_input("Start Date", st.session_state['min_date'])
812
  with col2:
813
- # Use the max_date from transactions if available, otherwise use default
814
  end_date = st.date_input("End Date", st.session_state['max_date'])
815
-
816
  statement_type = st.selectbox("Select Financial Statement", ["Income Statement", "Cashflow Statement", "Balance Sheet"])
817
 
818
  if st.button("Generate Financial Report"):
819
- st.info(f"User clicked 'Generate Financial Report' for {statement_type} from {start_date} to {end_date}.")
820
- if not all_transactions:
821
  st.error("No transactions available to generate report. Please upload files first.")
822
  else:
823
- # Filter transactions by date
824
- st.info(f"Filtering {len(all_transactions)} transactions for the period {start_date} to {end_date}...")
825
- filtered_transactions = []
826
- for transaction in all_transactions:
827
- try:
828
- transaction_date = datetime.strptime(transaction.get('Date'), '%d/%m/%Y').date()
829
- if start_date <= transaction_date <= end_date:
830
- filtered_transactions.append(transaction)
831
- except (ValueError, TypeError):
832
- st.warning(f"Could not parse date for transaction, skipping: {transaction}")
833
- continue
834
-
835
- if not filtered_transactions:
836
- st.warning("No transactions found within the selected date range. Please adjust dates or upload relevant files.")
837
  else:
838
- st.info(f"Found {len(filtered_transactions)} transactions within the selected date range.")
 
839
  try:
840
- model1 = configure_gemini1(api_key)
841
-
842
- # Decide whether to use batched or regular processing
843
- if len(filtered_transactions) > 600:
844
- st.info(f"Large dataset detected ({len(filtered_transactions)} transactions). Using batched processing...")
845
- with st.spinner("Generating batched financial report..."):
846
- report_text = generate_batched_financial_report(
847
- model1, filtered_transactions, start_date, end_date, statement_type
848
- )
849
- else:
850
- st.info("Using standard processing for smaller dataset...")
851
- combined_json = {"transactions": filtered_transactions}
852
- with st.spinner("Generating financial report..."):
853
- report_text = generate_financial_report(model1, combined_json, start_date, end_date, statement_type)
854
-
855
- if report_text:
856
- st.success("Financial report generated successfully!")
857
-
858
- # Display the report as markdown
859
- st.markdown("### Financial Report Preview")
860
- st.markdown(report_text)
861
-
862
- # Create PDF from markdown
863
- try:
864
- st.info("Generating PDF from the report...")
865
  pdf_buffer = create_pdf_report(report_text)
866
  st.download_button(
867
  label="Download Financial Report as PDF",
868
- data=pdf_buffer.getvalue(),
869
  file_name=f"{statement_type.replace(' ', '_')}_{datetime.now().strftime('%Y%m%d')}.pdf",
870
  mime="application/pdf"
871
  )
872
- st.success("PDF download ready.")
873
- except Exception as e:
874
- st.error(f"Error generating PDF: {str(e)}")
875
- st.exception(e)
876
-
877
  except Exception as e:
878
- st.error(f"Error generating financial report: {str(e)}")
879
- if "504" in str(e):
880
- st.info("Consider using a smaller date range or fewer transactions.")
881
  st.exception(e)
882
 
883
  if __name__ == "__main__":
 
2
  import json
3
  import os
4
  import time
5
+ from datetime import datetime, date
6
  from io import BytesIO
 
7
  import pandas as pd
8
  import streamlit as st
9
  import google.generativeai as genai
10
  import pypdf
11
  from fpdf import FPDF
 
 
12
  from google.api_core import exceptions
13
+ import markdown
14
+ from bs4 import BeautifulSoup
15
 
16
+ # Configure API key for Gemini - Ensure this is set in your environment variables
17
  api_key = os.getenv('Gemini')
18
 
19
def configure_gemini(api_key):
    """Configure the Gemini client and return the transaction-extraction model.

    Args:
        api_key: Gemini API key (read from the 'Gemini' environment variable
            at module level).

    Returns:
        A GenerativeModel instance used for parsing bank-statement text.

    Raises:
        ValueError: If api_key is missing/empty. genai.configure accepts a
            missing key silently and the failure would otherwise only surface
            as an opaque error on the first generate_content call.
    """
    if not api_key:
        raise ValueError("Gemini API key is not set (expected in the 'Gemini' environment variable).")
    st.info("Configuring Gemini API for transaction extraction...")
    genai.configure(api_key=api_key)
    # Model chosen specifically for structured transaction extraction.
    return genai.GenerativeModel('gemini-2.0-flash-thinking-exp')
27
 
28
def configure_gemini1(api_key):
    """Configure the Gemini client and return the report-generation model.

    Args:
        api_key: Gemini API key (read from the 'Gemini' environment variable
            at module level).

    Returns:
        A GenerativeModel instance used for formatting the financial report.

    Raises:
        ValueError: If api_key is missing/empty, so misconfiguration fails
            fast instead of surfacing later as an opaque API error.
    """
    if not api_key:
        raise ValueError("Gemini API key is not set (expected in the 'Gemini' environment variable).")
    st.info("Configuring Gemini API for report generation...")
    genai.configure(api_key=api_key)
    # Higher-quality model used only for the final formatting pass.
    return genai.GenerativeModel('gemini-2.5-pro')
36
+
 
37
def read_pdf_pages(file_obj):
    """Open an uploaded PDF and report how many pages it contains.

    Args:
        file_obj: A file-like object (e.g. a Streamlit UploadedFile) exposing
            ``name`` and ``seek``.

    Returns:
        Tuple of (pypdf.PdfReader, total page count).
    """
    st.info(f"Reading PDF pages from {file_obj.name}...")
    # Rewind first: the uploaded file's cursor may not be at the start.
    file_obj.seek(0)
    reader = pypdf.PdfReader(file_obj)
    page_count = len(reader.pages)
    st.info(f"Found {page_count} pages in PDF.")
    return reader, page_count
44
 
 
45
def extract_page_text(pdf_reader, page_num):
    """Return the extracted text of one PDF page.

    Args:
        pdf_reader: An open pypdf.PdfReader.
        page_num: Zero-based page index.

    Returns:
        The page text, or "" when the index is out of range or extraction
        yields nothing.
    """
    # Guard clause: out-of-range pages simply yield an empty string.
    if page_num >= len(pdf_reader.pages):
        return ""
    page_text = pdf_reader.pages[page_num].extract_text()
    if not page_text or not page_text.strip():
        st.warning(f"Page {page_num + 1} appears to be empty or contains no extractable text.")
    return page_text or ""
52
 
 
53
  def process_with_gemini(model, text):
54
  prompt = """Analyze this bank statement and extract transactions in JSON format with these fields:
55
  - Date (format DD/MM/YYYY)
 
76
  ]
77
  }"""
78
  try:
 
79
  response = model.generate_content([prompt, text])
80
+ time.sleep(6) # Retaining original sleep time as per user's working code
 
81
  return response.text
82
+ except exceptions.GoogleAPICallError as e:
83
+ st.error(f"A Google API call error occurred during transaction extraction: {e}")
84
+ if "context length" in str(e):
85
+ st.warning("The text on a single PDF page may be too long for the extraction model.")
86
+ return None
 
 
87
  except Exception as e:
88
+ st.error(f"An unexpected error occurred during Gemini transaction extraction: {e}")
89
  return None
90
 
 
91
def process_pdf_pages(model, pdf_reader, total_pages, progress_callback=None):
    """Extract transactions from every page of a PDF, one Gemini call per page.

    Args:
        model: Gemini model from configure_gemini().
        pdf_reader: An open pypdf.PdfReader.
        total_pages: Number of pages to walk.
        progress_callback: Optional callable(progress_fraction, message) used
            by the caller's UI to report progress.

    Returns:
        A list of transaction dicts accumulated across all pages.
    """
    all_transactions = []
    st.info(f"Starting page-by-page PDF processing for {total_pages} pages...")
    for page_num in range(total_pages):
        if progress_callback:
            progress_callback(page_num / total_pages, f"Processing page {page_num + 1} of {total_pages}")
        # Pages with no extractable text (e.g. scanned images) are skipped.
        page_text = extract_page_text(pdf_reader, page_num)
        if not page_text.strip():
            continue
        st.info(f"Sending page {page_num + 1} text to Gemini for transaction extraction...")
        json_response = process_with_gemini(model, page_text)
        if not json_response:
            st.warning(f"Gemini returned no response for page {page_num + 1}.")
            continue
        # The model may wrap the JSON in prose; grab the outermost {...} span.
        match = re.search(r'\{.*\}', json_response, re.DOTALL)
        if match is None:
            st.warning(f"No valid JSON object found in Gemini response for page {page_num + 1}.")
            continue
        try:
            payload = json.loads(match.group(0))
        except json.JSONDecodeError:
            st.error(f"Failed to decode JSON from Gemini response for page {page_num + 1}.")
            continue
        transactions = payload.get('transactions', [])
        if transactions:
            st.info(f"Successfully extracted {len(transactions)} transactions from page {page_num + 1}.")
            all_transactions.extend(transactions)
    st.info(f"Finished processing all pages. Total transactions extracted: {len(all_transactions)}.")
    return all_transactions
128
 
129
def aggregate_financial_data(transactions: list, statement_type: str):
    """Aggregate raw transaction dicts into a compact financial summary.

    The heavy lifting (sums, groupings) is done locally with pandas so that
    only a small JSON summary — not the full transaction list — is sent to
    the LLM for formatting.

    Args:
        transactions: List of transaction dicts; expected keys include
            'Amount' and 'Type' ('income'/'expense', case-insensitive), and
            optionally 'Category_of_expense' / 'Customer_name'.
        statement_type: One of "Income Statement", "Cashflow Statement",
            "Balance Sheet"; controls which extra aggregates are included.

    Returns:
        A dict of aggregate figures, or None when there is nothing usable
        to aggregate (empty input or required columns missing).
    """
    st.info(f"Performing local financial aggregation for {len(transactions)} transactions...")
    if not transactions:
        st.warning("No transactions to aggregate.")
        return None

    df = pd.DataFrame(transactions)

    # Robustness fix: CSV uploads are arbitrary, so a missing 'Amount' or
    # 'Type' column previously crashed with a KeyError. Bail out gracefully
    # instead; callers already treat None as "nothing to report".
    missing = [col for col in ('Amount', 'Type') if col not in df.columns]
    if missing:
        st.warning(f"Cannot aggregate: missing required column(s) {missing}.")
        return None

    # --- Data Cleaning and Preparation ---
    df['Amount'] = pd.to_numeric(df['Amount'], errors='coerce').fillna(0)
    df['Type'] = df['Type'].astype(str).str.lower()

    # --- Core Financial Calculations ---
    # Cast to plain float so the summary is always JSON-serializable.
    total_income = float(df[df['Type'] == 'income']['Amount'].sum())
    total_expenses = float(df[df['Type'] == 'expense']['Amount'].sum())
    net_position = total_income - total_expenses

    # --- Build the Aggregated Data Structure ---
    aggregated_data = {
        "total_income": total_income,
        "total_expenses": total_expenses,
        "net_position": net_position,
        "transaction_count": len(df)
    }

    # --- Statement-Specific Aggregations ---
    if statement_type == "Income Statement":
        # Group only when the optional column exists; otherwise report an
        # empty breakdown rather than crashing with a KeyError.
        if 'Category_of_expense' in df.columns:
            aggregated_data["expense_breakdown"] = (
                df[df['Type'] == 'expense'].groupby('Category_of_expense')['Amount'].sum().round(2).to_dict()
            )
        else:
            aggregated_data["expense_breakdown"] = {}
        if 'Customer_name' in df.columns:
            aggregated_data["income_breakdown"] = (
                df[df['Type'] == 'income'].groupby('Customer_name')['Amount'].sum().round(2).to_dict()
            )
        else:
            aggregated_data["income_breakdown"] = {}
    elif statement_type == "Cashflow Statement":
        # With transaction flows only, operating cash flow equals the net
        # income/expense position over the period.
        aggregated_data["operating_cash_flow"] = net_position
        aggregated_data["cash_inflows"] = total_income
        aggregated_data["cash_outflows"] = total_expenses
    elif statement_type == "Balance Sheet":
        aggregated_data["notes"] = "Balance Sheets require asset and liability balances, not just transaction flows. This data can only show the net change in cash over the period."

    st.success("Local financial aggregation complete.")
    return aggregated_data
173
+
174
def generate_financial_report(model, aggregated_data, start_date, end_date, statement_type):
    """
    Generates a financial report by sending a small, pre-aggregated summary to the LLM.
    The LLM's job is to format this data professionally, not to calculate it.

    Args:
        model: Configured Gemini GenerativeModel used for formatting.
        aggregated_data: Summary dict produced by aggregate_financial_data().
        start_date: Period start (a date; formatted DD/MM/YYYY in the prompt).
        end_date: Period end (a date; formatted DD/MM/YYYY in the prompt).
        statement_type: "Income Statement", "Cashflow Statement" or "Balance Sheet".

    Returns:
        The Markdown report text, or None when the Gemini call fails.
    """
    st.info(f"Preparing to generate {statement_type} with pre-aggregated data...")
    # The prompt embeds the (small) aggregated summary directly; the model is
    # asked only to format it, never to compute figures itself.
    prompt = f"""
Based on the following pre-aggregated financial summary JSON data:
{json.dumps(aggregated_data, indent=2)}

Generate a detailed {statement_type} report for the period from {start_date.strftime('%d/%m/%Y')} to {end_date.strftime('%d/%m/%Y')}. Present the report in a standard accounting format relevant to South Africa, but with improved readability and visual appeal.

Specific Formatting and Content Requirements:

Standard Accounting Structure (South Africa Focus): Organize the {statement_type} according to typical accounting practices followed in South Africa (e.g., for an Income Statement, clearly separate Revenue, Cost of Goods Sold, Gross Profit, Operating Expenses, and Net Income, in nice tables considering local terminology where applicable). If unsure of specific local variations, adhere to widely accepted international accounting structures.
Clear Headings and Subheadings: Use distinct and informative headings and subheadings in English to delineate different sections of the report. Ensure these are visually prominent.
Consistent Formatting: Maintain consistent formatting for monetary values (using "R" for South African Rand), dates, and alignment.
Totals and Subtotals: Clearly display totals for relevant categories and subtotals where appropriate.
Descriptive Line Items: Use the provided aggregated data to create clear line items.
Key Insights: Include a brief section (e.g., "Key Highlights" or "Summary") that identifies significant trends or key performance indicators derived from the provided summary data.
Concise Summary: Provide a concluding summary paragraph that encapsulates the overall financial picture.
Special Case for Balance Sheet: If the request is for a "Balance Sheet," explain professionally that a balance sheet cannot be generated from transaction data alone, as it requires a snapshot of assets, liabilities, and equity. Then, present the available cash flow information as a helpful alternative.

Format the entire report in Markdown for better visual structure.
Do not name the company if a name is not there; refer to it as "The Business". Return just the report and nothing else.
"""
    try:
        st.info("Sending request to Gemini for final report formatting...")
        response = model.generate_content([prompt])
        # NOTE(review): retained from the original flow — presumably crude
        # rate limiting between API calls; confirm before removing.
        time.sleep(7)  # Retaining original sleep time
        st.success("Successfully received formatted financial report from Gemini.")
        return response.text
    except exceptions.GoogleAPICallError as e:
        # API-level failures (quota, timeouts, etc.) are surfaced to the UI;
        # the caller treats None as "no report produced".
        st.error(f"A Google API call error occurred during report generation: {e}")
        return None
    except Exception as e:
        st.error(f"An unexpected error occurred during Gemini report generation: {e}")
        return None
 
 
 
 
 
 
 
 
 
 
 
 
 
212
 
213
+ # --- PDF Generation Logic (Unaltered as per your request) ---
214
  class PDF_Generator(FPDF):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
215
  def add_html_element(self, tag, styles):
 
216
  text = tag.get_text()
217
  tag_name = tag.name.lower()
 
 
218
  current_style = ''
219
+ if 'b' in styles or 'strong' in styles: current_style += 'B'
220
+ if 'i' in styles or 'em' in styles: current_style += 'I'
221
+ if not current_style: self.set_font('helvetica', '', self.font_size_pt)
 
 
 
 
 
 
222
  if tag_name in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']:
223
  level = int(tag_name[1])
224
  font_size = {1: 18, 2: 16, 3: 14, 4: 12, 5: 11, 6: 10}.get(level, 10)
225
+ self.set_font('helvetica', 'B', font_size)
226
+ self.multi_cell(0, font_size * 0.5, text, align='L')
227
+ self.ln(font_size * 0.3)
228
+ self.set_font('helvetica', '', 10)
229
  elif tag_name == 'p':
230
  self.set_font('helvetica', current_style, 10)
231
+ self.multi_cell(0, 5, text, align='L')
232
+ self.ln(3)
233
  elif tag_name == 'ul':
234
  self.ln(2)
235
  for item in tag.find_all('li', recursive=False):
236
+ self.set_font('helvetica', '', 10)
 
 
 
 
 
 
 
 
 
237
  item_text = item.get_text()
238
+ self.cell(5, 5, chr(127))
239
  self.multi_cell(0, 5, item_text, align='L')
240
  self.ln(1)
241
  self.ln(3)
 
243
  self.ln(5)
244
  self.process_table(tag)
245
  self.ln(5)
246
+ elif tag_name == 'br': self.ln(5)
 
 
 
 
 
247
  elif tag_name == 'hr':
248
  self.ln(2)
249
  self.line(self.get_x(), self.get_y(), self.w - self.r_margin, self.get_y())
250
  self.ln(4)
251
  else:
252
+ if text.strip():
 
253
  self.set_font('helvetica', current_style, 10)
254
  self.multi_cell(0, 5, text, align='L')
255
  self.ln(1)
256
 
257
    def process_table(self, table_tag):
        """Render an HTML <table> element as a bordered FPDF grid.

        Column count is taken from the first row; columns share the printable
        width equally. Rows whose cell count differs from the header are
        skipped. NOTE(review): each cell is drawn with a fixed-height
        multi_cell, so text that wraps to multiple lines may overflow its
        row — confirm acceptable for expected report tables.
        """
        rows = table_tag.find_all('tr')
        if not rows: return
        # The first row fixes the column count for the whole table.
        header_cells = rows[0].find_all(['th', 'td'])
        num_cols = len(header_cells)
        if num_cols == 0: return
        # Split the printable width (page minus margins) evenly per column.
        effective_width = self.w - self.l_margin - self.r_margin
        col_width = effective_width / num_cols
        default_cell_height = 6
        is_first_row = True
        for row in rows:
            cells = row.find_all(['th', 'td'])
            # Skip malformed rows that don't match the header's column count.
            if len(cells) != num_cols: continue
            # Header styling when the row is all <th>, or when the first row
            # contains any <th> cell.
            is_header_row = all(c.name == 'th' for c in cells) or (is_first_row and any(c.name == 'th' for c in cells))
            for i, cell in enumerate(cells):
                cell_text = cell.get_text().strip()
                if is_header_row:
                    self.set_font('helvetica', 'B', 9)
                    self.set_fill_color(230, 230, 230)  # light grey header background
                    fill = True
                else:
                    self.set_font('helvetica', '', 9)
                    fill = False
                # new_x="RIGHT"/new_y="TOP" keeps the cursor on the same row
                # so cells are laid out side by side (fpdf2 keywords).
                self.multi_cell(col_width, default_cell_height, cell_text, border=1, align='L', fill=fill, new_x="RIGHT", new_y="TOP")
            # Advance to the next table row.
            self.ln(default_cell_height)
            is_first_row = False
 
 
 
 
 
 
283
 
284
def create_pdf_report(report_text):
    """Render the Markdown report into a PDF and return it as a BytesIO buffer.

    Pipeline: strip code fences -> Markdown -> HTML (with tables) ->
    BeautifulSoup walk -> PDF_Generator.

    Args:
        report_text: Markdown report produced by generate_financial_report().

    Returns:
        BytesIO containing the finished PDF bytes.

    Raises:
        ValueError: If report_text is empty.
        Exception: Any rendering failure is logged to the UI and re-raised.
    """
    if not report_text:
        st.warning("Report text is empty, skipping PDF generation.")
        raise ValueError("Input report_text cannot be empty.")
    try:
        st.info("Starting PDF generation from markdown report...")
        # Strip any ```markdown fences the model may have wrapped the report in.
        cleaned_md = re.sub(r'```markdown|```', '', report_text, flags=re.MULTILINE).strip()
        html_content = markdown.markdown(cleaned_md, extensions=['tables'])
        soup = BeautifulSoup(html_content, 'html.parser')
        pdf = PDF_Generator()
        pdf.set_auto_page_break(auto=True, margin=15)
        pdf.set_left_margin(15)
        pdf.set_right_margin(15)
        pdf.add_page()
        pdf.set_font('helvetica', '', 10)
        # Walk only top-level HTML elements; nested tags are handled inside
        # PDF_Generator.add_html_element.
        for element in soup.find_all(recursive=False):
            pdf.add_html_element(element, set())
        st.info("Content added to PDF. Outputting PDF to buffer...")
        # Bug fix: fpdf2's output() returns a bytearray, which has no .encode()
        # method — the previous `pdf.output(dest='S').encode('latin-1')` raised
        # AttributeError on every call (the file relies on fpdf2 elsewhere,
        # e.g. the new_x/new_y multi_cell keywords). Convert to immutable bytes.
        pdf_output = bytes(pdf.output())
        st.success("PDF report generated successfully.")
        return BytesIO(pdf_output)
    except Exception as e:
        st.error(f"Failed to generate PDF: {e}")
        st.exception(e)
        raise
 
309
 
310
def main():
    """Streamlit entry point: upload bank data, preview it, generate reports.

    Flow: (1) initialise session state, (2) ingest transactions from bulk PDF
    statements (via Gemini extraction) or a CSV, (3) preview the consolidated
    transactions and derive the date range, (4) on demand, aggregate the
    selected period locally and have Gemini format the chosen statement,
    offering the result as a PDF download.
    """
    st.title("Quantitlytix AI")
    st.markdown("*Bank Statement Parser & Financial Report Generator*")

    # Session-state defaults survive Streamlit reruns; the date bounds are
    # overwritten below once real transactions are loaded.
    if 'min_date' not in st.session_state:
        st.session_state['min_date'] = date(2024, 1, 1)
    if 'max_date' not in st.session_state:
        st.session_state['max_date'] = date.today()
    if 'transactions' not in st.session_state:
        st.session_state['transactions'] = []

    input_type = st.sidebar.radio("Select Input Type", ("Bulk Bank Statement Upload", "CSV Upload"))

    if input_type == "Bulk Bank Statement Upload":
        uploaded_files = st.file_uploader("Upload PDF bank statements", type="pdf", accept_multiple_files=True)
        if uploaded_files:
            st.info(f"User uploaded {len(uploaded_files)} PDF file(s).")
            model = configure_gemini(api_key)
            progress_bar = st.progress(0)
            all_transactions = []
            # One pass per file; progress advances per completed file.
            for i, file in enumerate(uploaded_files):
                st.text(f"Processing {file.name}...")
                pdf_reader, total_pages = read_pdf_pages(file)
                if total_pages > 0:
                    file_transactions = process_pdf_pages(model, pdf_reader, total_pages)
                    all_transactions.extend(file_transactions)
                progress_bar.progress((i + 1) / len(uploaded_files))
            # NOTE: replaces (not appends to) any previously loaded transactions.
            st.session_state['transactions'] = all_transactions
            st.success(f"All PDF files processed. Total transactions collected: {len(st.session_state['transactions'])}.")

    elif input_type == "CSV Upload":
        uploaded_csv = st.file_uploader("Upload CSV of transactions", type="csv")
        if uploaded_csv:
            st.info(f"User uploaded CSV file: {uploaded_csv.name}.")
            df = pd.read_csv(uploaded_csv)
            # Drop pandas' auto-generated index columns from re-exported CSVs.
            df = df.loc[:, ~df.columns.str.startswith('Unnamed:')]
            st.session_state['transactions'] = df.to_dict(orient='records')
            st.success(f"Successfully loaded {len(st.session_state['transactions'])} transactions from CSV.")

    if st.session_state['transactions']:
        st.info("Consolidating and displaying all extracted transactions.")
        df = pd.DataFrame(st.session_state['transactions'])
        # dayfirst matches the DD/MM/YYYY format requested from the extractor;
        # unparseable dates become NaT and are dropped from the preview.
        # NOTE(review): assumes a 'Date' column exists — a CSV without one
        # raises KeyError here; confirm whether that needs guarding.
        df['Date'] = pd.to_datetime(df['Date'], errors='coerce', dayfirst=True)
        df.dropna(subset=['Date'], inplace=True)
        if not df.empty:
            # Default the report date pickers to the data's actual span.
            min_date = df['Date'].min().date()
            max_date = df['Date'].max().date()
            st.session_state['min_date'] = min_date
            st.session_state['max_date'] = max_date
        st.write("### Extracted Transactions")
        # astype(str) keeps mixed-type columns renderable in the data grid.
        st.dataframe(df.astype(str))
    else:
        st.info("No transactions loaded yet. Upload files to begin.")

    st.write("### Generate Financial Report")
    col1, col2 = st.columns(2)
    with col1:
        start_date = st.date_input("Start Date", st.session_state['min_date'])
    with col2:
        end_date = st.date_input("End Date", st.session_state['max_date'])
    statement_type = st.selectbox("Select Financial Statement", ["Income Statement", "Cashflow Statement", "Balance Sheet"])

    if st.button("Generate Financial Report"):
        if not st.session_state['transactions']:
            st.error("No transactions available to generate report. Please upload files first.")
        else:
            # Re-parse dates (session state stores plain dicts) and keep only
            # transactions inside the inclusive [start_date, end_date] window.
            df = pd.DataFrame(st.session_state['transactions'])
            df['Date'] = pd.to_datetime(df['Date'], errors='coerce', dayfirst=True)
            mask = (df['Date'] >= pd.to_datetime(start_date)) & (df['Date'] <= pd.to_datetime(end_date))
            filtered_df = df.loc[mask]

            if filtered_df.empty:
                st.warning("No transactions found within the selected date range.")
            else:
                st.info(f"Found {len(filtered_df)} transactions within the selected date range.")
                filtered_transactions_list = filtered_df.to_dict(orient='records')
                try:
                    # Aggregate locally first so only a small summary — not the
                    # full transaction list — is sent to the LLM.
                    with st.spinner("Aggregating financial data locally..."):
                        aggregated_summary = aggregate_financial_data(filtered_transactions_list, statement_type)
                    if aggregated_summary:
                        with st.spinner("Generating formatted report with Gemini..."):
                            model1 = configure_gemini1(api_key)
                            report_text = generate_financial_report(model1, aggregated_summary, start_date, end_date, statement_type)
                        if report_text:
                            st.success("Financial report generated successfully!")
                            st.markdown("### Financial Report Preview")
                            st.markdown(report_text, unsafe_allow_html=True)
                            pdf_buffer = create_pdf_report(report_text)
                            st.download_button(
                                label="Download Financial Report as PDF",
                                data=pdf_buffer,
                                file_name=f"{statement_type.replace(' ', '_')}_{datetime.now().strftime('%Y%m%d')}.pdf",
                                mime="application/pdf"
                            )
                        else:
                            st.error("Failed to generate the financial report from the aggregated data.")
                except Exception as e:
                    # Catch-all boundary: surface the error in the UI with a
                    # traceback instead of crashing the app.
                    st.error(f"An unexpected error occurred during the report generation process: {e}")
                    st.exception(e)
409
 
410
  if __name__ == "__main__":