rairo committed on
Commit
91c8199
·
verified ·
1 Parent(s): 4ed1df1

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +87 -38
app.py CHANGED
@@ -19,24 +19,31 @@ from html_to_markdown import convert_to_markdown
19
  api_key = os.getenv('Gemini')
20
 
def configure_gemini(api_key):
    """Configure ``genai`` with *api_key* and return the extraction model.

    Args:
        api_key: Key passed straight through to ``genai.configure``.

    Returns:
        A ``genai.GenerativeModel`` bound to
        ``'gemini-2.0-flash-thinking-exp'``.
    """
    genai.configure(api_key=api_key)
    extraction_model = genai.GenerativeModel("gemini-2.0-flash-thinking-exp")
    return extraction_model
24
 
def configure_gemini1(api_key):
    """Configure ``genai`` with *api_key* and return the reporting model.

    Args:
        api_key: Key passed straight through to ``genai.configure``.

    Returns:
        A ``genai.GenerativeModel`` bound to ``'gemini-2.5-flash'``.
    """
    genai.configure(api_key=api_key)
    report_model = genai.GenerativeModel("gemini-2.5-flash")
    return report_model
28
 
29
  # Read PDF content page by page from a file-like object
def read_pdf_pages(file_obj):
    """Open a PDF from a file-like object and report how many pages it has.

    Args:
        file_obj: Seekable file-like object holding the PDF data.

    Returns:
        A ``(reader, page_count)`` tuple, where ``reader`` is the
        ``pypdf.PdfReader`` built from *file_obj*.
    """
    # Rewind first so the reader always parses from the beginning of the data.
    file_obj.seek(0)
    reader = pypdf.PdfReader(file_obj)
    return reader, len(reader.pages)
35
 
36
  # Extract text from a specific page
def extract_page_text(pdf_reader, page_num):
    """Return the text of page *page_num*, or ``""`` when none is available.

    Args:
        pdf_reader: Object exposing a ``pages`` sequence whose items provide
            ``extract_text()``.
        page_num: Zero-based page index.

    Returns:
        The extracted text; an empty string if the index is past the last
        page or the page yields no text.
    """
    pages = pdf_reader.pages
    if page_num >= len(pages):
        return ""
    # A falsy extraction result (None or "") is reported as an empty string.
    return pages[page_num].extract_text() or ""
42
 
@@ -67,19 +74,26 @@ def process_with_gemini(model, text):
67
  ]
68
  }"""
69
  try:
 
70
  response = model.generate_content([prompt, text])
71
  time.sleep(6) # Sleep for 6 seconds to work around rate limit
 
72
  return response.text
73
  except exceptions.ServiceUnavailable as e:
74
  if e.response.status_code == 504:
75
  st.error("Error generating report: Gemini API timed out (504). Please try reducing the time period for the report.")
76
  return None
77
  else:
 
78
  raise
 
 
 
79
 
80
  # Process PDF page by page to handle large files
81
  def process_pdf_pages(model, pdf_reader, total_pages, progress_callback=None):
82
  all_transactions = []
 
83
 
84
  # Process pages individually or in small chunks
85
  for page_num in range(total_pages):
@@ -91,9 +105,11 @@ def process_pdf_pages(model, pdf_reader, total_pages, progress_callback=None):
91
  page_text = extract_page_text(pdf_reader, page_num)
92
 
93
  if not page_text.strip():
 
94
  continue # Skip empty pages
95
 
96
  # Process the page with Gemini
 
97
  json_response = process_with_gemini(model, page_text)
98
 
99
  if json_response:
@@ -102,6 +118,7 @@ def process_pdf_pages(model, pdf_reader, total_pages, progress_callback=None):
102
  end_idx = json_response.rfind('}') + 1
103
 
104
  if start_idx == -1 or end_idx == 0:
 
105
  continue # Skip invalid JSON
106
 
107
  json_str = json_response[start_idx:end_idx]
@@ -110,16 +127,23 @@ def process_pdf_pages(model, pdf_reader, total_pages, progress_callback=None):
110
  try:
111
  data = json.loads(json_str)
112
  transactions = data.get('transactions', [])
113
-
114
- # Add transactions to the overall list
115
- all_transactions.extend(transactions)
 
 
116
  except json.JSONDecodeError:
 
117
  continue # Skip invalid JSON
118
-
 
 
 
119
  return all_transactions
120
 
121
  # Generate financial report from aggregated JSON transactions and chosen parameters
122
  def generate_financial_report(model, json_data, start_date, end_date, statement_type):
 
123
  prompt = f"""Based on the following transactions JSON data:
124
  {json.dumps(json_data)}
125
  Generate a detailed {statement_type} report for the period from {start_date.strftime('%d/%m/%Y')} to {end_date.strftime('%d/%m/%Y')}. Present the report in a standard accounting format relevant to South Africa, but with improved readability and visual appeal.
@@ -137,8 +161,10 @@ Concise Summary: Provide a concluding summary paragraph that encapsulates the ov
137
  Format the report in Markdown for better visual structure.
138
  Do not name the company if name is not there and return just the report and nothing else."""
139
  try:
 
140
  response = model.generate_content([prompt])
141
  time.sleep(7) # Sleep for 7 seconds to work around rate limit
 
142
  return response.text
143
  except exceptions.ServiceUnavailable as e:
144
  if e.response.status_code == 504:
@@ -146,7 +172,11 @@ Do not name the company if name is not there and return just the report and noth
146
  st.session_state['last_error'] = "504" # Store the error in session state
147
  return None
148
  else:
 
149
  raise
 
 
 
150
 
151
 
152
  # Install required libraries:
@@ -312,34 +342,38 @@ def create_pdf_report(report_text):
312
  Exception: If PDF generation fails.
313
  """
314
  if not report_text:
 
315
  raise ValueError("Input report_text cannot be empty.")
316
 
317
  try:
 
318
  # 1. Clean Markdown
319
  cleaned_md = re.sub(r'^```markdown\s*', '', report_text, flags=re.MULTILINE)
320
  cleaned_md = re.sub(r'\s*```$', '', cleaned_md, flags=re.MULTILINE)
321
  cleaned_md = cleaned_md.strip()
 
322
 
323
  # 2. Convert Markdown to HTML
324
  html_content = markdown.markdown(cleaned_md, extensions=['tables', 'fenced_code', 'sane_lists'])
325
  if not html_content:
 
326
  raise ValueError("Markdown parsing resulted in empty HTML.")
 
327
 
328
  # 3. Parse HTML with BeautifulSoup
329
  soup = BeautifulSoup(html_content, 'html.parser')
 
330
 
331
  # 4. Generate PDF using FPDF
332
  pdf = PDF_Generator()
333
  pdf.add_page()
334
  pdf.set_font('helvetica', '', 10) # Default font
 
335
 
336
  # Iterate through top-level tags in the HTML body
337
  for element in soup.find_all(recursive=False):
338
- # Track basic nested styles like bold/italic
339
- # This is very basic and might not handle complex nesting well
340
  styles = set()
341
  def traverse(tag, current_styles):
342
- # Check for styling tags
343
  local_style_added = None
344
  if tag.name in ['b', 'strong']:
345
  current_styles.add('b')
@@ -348,32 +382,29 @@ def create_pdf_report(report_text):
348
  current_styles.add('i')
349
  local_style_added = 'i'
350
 
351
- # If it's a text node, process it with current styles (handled within add_html_element)
352
- # If it's a known block element, process it
353
  if tag.name in ['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'ul', 'ol', 'table', 'br', 'hr']:
354
- pdf.add_html_element(tag, current_styles.copy()) # Pass styles down
355
  else:
356
- # Recursively process children for other tags (like span, div, or styling tags)
357
  if hasattr(tag, 'contents'):
358
  for child in tag.contents:
359
- if isinstance(child, str): # Handle text nodes directly if needed (often handled by parent)
360
- pass # Usually text is grabbed by parent's get_text()
361
- elif hasattr(child, 'name'): # Check if it's a tag
362
- traverse(child, current_styles.copy()) # Pass styles down
363
 
364
- # Remove local style after processing the tag and its children
365
  if local_style_added and local_style_added in current_styles:
366
  current_styles.remove(local_style_added)
367
 
368
  traverse(element, styles)
369
 
370
-
371
  # 5. Output PDF to BytesIO buffer
372
  pdf_output = pdf.output(dest='S') # Output as bytes string
373
  if isinstance(pdf_output, str):
374
  # If output is string (older fpdf versions?), encode it
375
  pdf_output = pdf_output.encode('latin-1')
376
 
 
377
  return BytesIO(pdf_output)
378
 
379
  except ImportError:
@@ -382,17 +413,13 @@ def create_pdf_report(report_text):
382
  except Exception as e:
383
  st.error(f"Failed to generate PDF locally using FPDF: {type(e).__name__}: {e}")
384
  st.exception(e) # Show traceback in streamlit logs
385
- # Log intermediate steps if possible
386
- # print("--- Cleaned Markdown ---")
387
- # print(cleaned_md)
388
- # print("--- Generated HTML ---")
389
- # print(html_content)
390
  raise Exception(f"Local FPDF PDF generation failed: {e}") from e
391
 
392
 
393
  def main():
394
  st.title("Quantitlytix AI")
395
  st.markdown("*Bank Statement Parser & Financial Report Generator*")
 
396
 
397
  # Initialize session state for last error
398
  if 'last_error' not in st.session_state:
@@ -406,12 +433,14 @@ def main():
406
 
407
  # Sidebar: Select input type: Bulk PDF or CSV Upload
408
  input_type = st.sidebar.radio("Select Input Type", ("Bulk Bank Statement Upload", "CSV Upload"))
 
409
 
410
  all_transactions = []
411
 
412
  if input_type == "Bulk Bank Statement Upload":
413
  uploaded_files = st.file_uploader("Upload PDF bank statements", type="pdf", accept_multiple_files=True)
414
  if uploaded_files:
 
415
  total_files = len(uploaded_files)
416
  st.write(f"{total_files} PDF file(s) uploaded.")
417
  try:
@@ -423,9 +452,10 @@ def main():
423
 
424
  file_progress = 0
425
  for file_index, uploaded_file in enumerate(uploaded_files):
 
426
  # Update file progress
427
  file_progress = (file_index) / total_files
428
- progress_bar.progress(file_progress)
429
  status_text.text(f"Processing file {file_index+1} of {total_files}: {uploaded_file.name}")
430
 
431
  # Get PDF reader and page count
@@ -433,7 +463,7 @@ def main():
433
  pdf_reader, total_pages = read_pdf_pages(uploaded_file)
434
 
435
  if total_pages == 0:
436
- st.warning(f"No pages found in {uploaded_file.name}.")
437
  continue
438
 
439
  with st.spinner(f"Processing {uploaded_file.name} ({total_pages} pages)..."):
@@ -445,6 +475,7 @@ def main():
445
  status_text.text(f"File {file_index+1}/{total_files}: {message}")
446
 
447
  # Process the PDF page by page
 
448
  file_transactions = process_pdf_pages(
449
  model,
450
  pdf_reader,
@@ -454,21 +485,26 @@ def main():
454
 
455
  # Add transactions from this file to overall list
456
  all_transactions.extend(file_transactions)
 
457
 
458
  except Exception as e:
459
- st.error(f"Error processing {uploaded_file.name}: {str(e)}")
 
460
  continue
461
 
462
  # Complete the progress bar
463
  progress_bar.progress(1.0)
464
  status_text.text(f"Completed processing {total_files} files!")
 
465
 
466
  except Exception as e:
467
- st.error(f"Error processing PDF documents: {str(e)}")
468
  st.error("Please ensure you're using valid bank statement PDFs and a valid API key")
 
469
  elif input_type == "CSV Upload":
470
  uploaded_csv = st.file_uploader("Upload CSV of transactions", type="csv")
471
  if uploaded_csv:
 
472
  try:
473
  df = pd.read_csv(uploaded_csv)
474
  # Drop 'Unnamed:' columns from the uploaded CSV
@@ -478,19 +514,21 @@ def main():
478
  # Convert dataframe to list of transaction dictionaries
479
  transactions = df.to_dict(orient='records')
480
  all_transactions.extend(transactions)
 
481
  except Exception as e:
482
- st.error(f"Error processing CSV file: {str(e)}")
 
483
 
484
  # If transactions are loaded, show DataFrame and update date ranges
485
  if all_transactions:
 
486
  df = pd.DataFrame(all_transactions)
487
  # Drop 'Unnamed:' columns from the final DataFrame
488
  if not df.empty:
489
  df = df.loc[:, ~df.columns.str.startswith('Unnamed:')]
490
  try:
491
-
492
-
493
  # Process dates and extract min/max dates for date range inputs
 
494
  df['Date'] = pd.to_datetime(df['Date'], format='%d/%m/%Y', errors='coerce')
495
 
496
  # Get min and max dates from transactions
@@ -501,12 +539,15 @@ def main():
501
  # Update session state with actual transaction date range
502
  st.session_state['min_date'] = min_date
503
  st.session_state['max_date'] = max_date
 
 
 
504
 
505
  # Format dates for display
506
  df['Date'] = df['Date'].dt.strftime('%d/%m/%Y')
507
 
508
  except Exception as e:
509
- st.warning("Some data could not be formatted correctly.")
510
  st.exception(e)
511
 
512
  st.success("Transactions loaded successfully!")
@@ -515,7 +556,7 @@ def main():
515
  else:
516
  st.warning("No valid transactions could be extracted from the documents.")
517
  else:
518
- st.info("No transactions loaded yet.")
519
 
520
  # Financial report generation parameters
521
  st.write("### Generate Financial Report")
@@ -530,10 +571,12 @@ def main():
530
  statement_type = st.selectbox("Select Financial Statement", ["Income Statement", "Cashflow Statement", "Balance Sheet"])
531
 
532
  if st.button("Generate Financial Report"):
 
533
  if not all_transactions:
534
- st.error("No transactions available to generate report.")
535
  else:
536
  # Filter transactions by date
 
537
  filtered_transactions = []
538
  for transaction in all_transactions:
539
  try:
@@ -541,19 +584,20 @@ def main():
541
  if start_date <= transaction_date <= end_date:
542
  filtered_transactions.append(transaction)
543
  except (ValueError, TypeError):
544
- st.warning(f"Could not parse date for transaction: {transaction}")
545
  continue
546
 
547
  if not filtered_transactions:
548
- st.warning("No transactions found within the selected date range.")
549
  else:
 
550
  try:
551
  model1 = configure_gemini1(api_key)
552
  combined_json = {"transactions": filtered_transactions}
553
  with st.spinner("Generating financial report..."):
554
  report_text = generate_financial_report(model1, combined_json, start_date, end_date, statement_type)
555
  if report_text:
556
- st.success("Financial report generated!")
557
 
558
  # Display the report as markdown
559
  st.markdown("### Financial Report Preview")
@@ -561,6 +605,7 @@ def main():
561
 
562
  # Create PDF from markdown
563
  try:
 
564
  pdf_buffer = create_pdf_report(report_text)
565
  st.download_button(
566
  label="Download Financial Report as PDF",
@@ -568,20 +613,24 @@ def main():
568
  file_name=f"{statement_type.replace(' ', '_')}_{datetime.now().strftime('%Y%m%d')}.pdf",
569
  mime="application/pdf"
570
  )
 
571
  except Exception as e:
572
- st.error(f"Error generating PDF: {str(e)}")
573
  st.info("For better PDF generation, please ensure NotoSans fonts are installed in the same directory.")
 
574
  except exceptions.ServiceUnavailable as e:
575
  if e.response.status_code == 504:
576
  st.error("Error generating report: Gemini API timed out (504). Please try reducing the time period for the report.")
577
  else:
578
- st.error(f"Error generating financial report: {str(e)}")
 
579
  except Exception as e:
580
- st.error(f"Error generating financial report: {str(e)}")
581
  if "504" in str(e):
582
  st.info("The Gemini API might be overloaded. Consider generating reports for smaller time periods.")
583
  elif len(filtered_transactions) > 500:
584
  st.info("For large datasets, consider generating reports for smaller time periods.")
 
585
 
586
  if __name__ == "__main__":
587
  main()
 
19
  api_key = os.getenv('Gemini')
20
 
def configure_gemini(api_key):
    """Configure ``genai`` with *api_key* and return the extraction model.

    Emits a Streamlit info message before configuring the client.

    Args:
        api_key: Key passed straight through to ``genai.configure``.

    Returns:
        A ``genai.GenerativeModel`` bound to
        ``'gemini-2.0-flash-thinking-exp'``.
    """
    st.info("Configuring Gemini API for transaction extraction...")
    genai.configure(api_key=api_key)
    extraction_model = genai.GenerativeModel("gemini-2.0-flash-thinking-exp")
    return extraction_model
25
 
def configure_gemini1(api_key):
    """Configure ``genai`` with *api_key* and return the reporting model.

    Emits a Streamlit info message before configuring the client.

    Args:
        api_key: Key passed straight through to ``genai.configure``.

    Returns:
        A ``genai.GenerativeModel`` bound to ``'gemini-2.5-flash'``.
    """
    st.info("Configuring Gemini API for report generation...")
    genai.configure(api_key=api_key)
    report_model = genai.GenerativeModel("gemini-2.5-flash")
    return report_model
30
 
31
  # Read PDF content page by page from a file-like object
def read_pdf_pages(file_obj):
    """Open a PDF from a file-like object and report how many pages it has.

    Progress is reported through Streamlit info messages.

    Args:
        file_obj: Seekable file-like object holding the PDF data. A ``name``
            attribute is used for the log message when present, but is not
            required.

    Returns:
        A ``(pdf_reader, total_pages)`` tuple, where ``pdf_reader`` is the
        ``pypdf.PdfReader`` built from *file_obj*.
    """
    # Plain file-like objects (e.g. BytesIO) have no ``name``; logging must
    # not be the reason reading fails.
    source_name = getattr(file_obj, "name", "uploaded file")
    st.info(f"Reading PDF pages from {source_name}...")

    # Rewind first so the reader always parses from the beginning of the data.
    file_obj.seek(0)
    pdf_reader = pypdf.PdfReader(file_obj)
    total_pages = len(pdf_reader.pages)
    st.info(f"Found {total_pages} pages in PDF.")
    return pdf_reader, total_pages
39
 
40
  # Extract text from a specific page
def extract_page_text(pdf_reader, page_num):
    """Return the text of page *page_num*, or ``""`` when none is available.

    A Streamlit warning is shown when the page exists but yields no
    non-whitespace text.

    Args:
        pdf_reader: Object exposing a ``pages`` sequence whose items provide
            ``extract_text()``.
        page_num: Zero-based page index.

    Returns:
        The extracted text; an empty string if the index is past the last
        page or the page yields no text.
    """
    if page_num >= len(pdf_reader.pages):
        return ""

    # Normalise a falsy result (None or "") *before* calling .strip(), so a
    # page with no text produces a warning instead of an AttributeError.
    text = pdf_reader.pages[page_num].extract_text() or ""
    if not text.strip():
        st.warning(f"Page {page_num + 1} appears to be empty or contains no extractable text.")
    return text
49
 
 
74
  ]
75
  }"""
76
  try:
77
+ # st.debug("Sending text chunk to Gemini for transaction extraction...") # Too verbose
78
  response = model.generate_content([prompt, text])
79
  time.sleep(6) # Sleep for 6 seconds to work around rate limit
80
+ # st.debug("Received response from Gemini for transaction extraction.") # Too verbose
81
  return response.text
82
  except exceptions.ServiceUnavailable as e:
83
  if e.response.status_code == 504:
84
  st.error("Error generating report: Gemini API timed out (504). Please try reducing the time period for the report.")
85
  return None
86
  else:
87
+ st.error(f"Gemini API error during transaction extraction: {e}") # Log other API errors
88
  raise
89
+ except Exception as e:
90
+ st.error(f"An unexpected error occurred during Gemini transaction extraction: {e}") # Catch other potential errors
91
+ return None
92
 
93
  # Process PDF page by page to handle large files
94
  def process_pdf_pages(model, pdf_reader, total_pages, progress_callback=None):
95
  all_transactions = []
96
+ st.info(f"Starting page-by-page PDF processing for {total_pages} pages...") # Log
97
 
98
  # Process pages individually or in small chunks
99
  for page_num in range(total_pages):
 
105
  page_text = extract_page_text(pdf_reader, page_num)
106
 
107
  if not page_text.strip():
108
+ st.warning(f"Skipping empty or unreadable page {page_num + 1}.") # Log skipped pages
109
  continue # Skip empty pages
110
 
111
  # Process the page with Gemini
112
+ st.info(f"Sending page {page_num + 1} text to Gemini for transaction extraction...") # Log
113
  json_response = process_with_gemini(model, page_text)
114
 
115
  if json_response:
 
118
  end_idx = json_response.rfind('}') + 1
119
 
120
  if start_idx == -1 or end_idx == 0:
121
+ st.warning(f"No valid JSON found in Gemini response for page {page_num + 1}. Raw response: {json_response[:200]}...") # Log invalid JSON structure
122
  continue # Skip invalid JSON
123
 
124
  json_str = json_response[start_idx:end_idx]
 
127
  try:
128
  data = json.loads(json_str)
129
  transactions = data.get('transactions', [])
130
+ if transactions:
131
+ st.info(f"Successfully extracted {len(transactions)} transactions from page {page_num + 1}.") # Log successful extraction
132
+ all_transactions.extend(transactions)
133
+ else:
134
+ st.info(f"No transactions found on page {page_num + 1} based on Gemini's analysis.") # Log no transactions found on page
135
  except json.JSONDecodeError:
136
+ st.error(f"Failed to decode JSON from Gemini response for page {page_num + 1}. Check response format. Raw JSON snippet: {json_str[:200]}...") # Log JSON decode errors
137
  continue # Skip invalid JSON
138
+ else:
139
+ st.warning(f"Gemini returned no response for page {page_num + 1}. This page's transactions might be missing.") # Log no response from Gemini
140
+
141
+ st.info(f"Finished processing all pages. Total transactions extracted: {len(all_transactions)}.") # Final log for extraction
142
  return all_transactions
143
 
144
  # Generate financial report from aggregated JSON transactions and chosen parameters
145
  def generate_financial_report(model, json_data, start_date, end_date, statement_type):
146
+ st.info(f"Preparing prompt for Gemini to generate {statement_type} report from {start_date} to {end_date}...") # Log
147
  prompt = f"""Based on the following transactions JSON data:
148
  {json.dumps(json_data)}
149
  Generate a detailed {statement_type} report for the period from {start_date.strftime('%d/%m/%Y')} to {end_date.strftime('%d/%m/%Y')}. Present the report in a standard accounting format relevant to South Africa, but with improved readability and visual appeal.
 
161
  Format the report in Markdown for better visual structure.
162
  Do not name the company if name is not there and return just the report and nothing else."""
163
  try:
164
+ st.info("Sending request to Gemini for financial report generation...") # Log
165
  response = model.generate_content([prompt])
166
  time.sleep(7) # Sleep for 7 seconds to work around rate limit
167
+ st.success("Successfully received financial report from Gemini.") # Log success
168
  return response.text
169
  except exceptions.ServiceUnavailable as e:
170
  if e.response.status_code == 504:
 
172
  st.session_state['last_error'] = "504" # Store the error in session state
173
  return None
174
  else:
175
+ st.error(f"Gemini API error during report generation: {e}") # Log other API errors
176
  raise
177
+ except Exception as e:
178
+ st.error(f"An unexpected error occurred during Gemini report generation: {e}") # Catch other potential errors
179
+ return None
180
 
181
 
182
  # Install required libraries:
 
342
  Exception: If PDF generation fails.
343
  """
344
  if not report_text:
345
+ st.warning("Report text is empty, skipping PDF generation.") # Log
346
  raise ValueError("Input report_text cannot be empty.")
347
 
348
  try:
349
+ st.info("Starting PDF generation from markdown report...") # Log
350
  # 1. Clean Markdown
351
  cleaned_md = re.sub(r'^```markdown\s*', '', report_text, flags=re.MULTILINE)
352
  cleaned_md = re.sub(r'\s*```$', '', cleaned_md, flags=re.MULTILINE)
353
  cleaned_md = cleaned_md.strip()
354
+ # st.debug("Markdown cleaned.") # Too verbose
355
 
356
  # 2. Convert Markdown to HTML
357
  html_content = markdown.markdown(cleaned_md, extensions=['tables', 'fenced_code', 'sane_lists'])
358
  if not html_content:
359
+ st.error("Markdown parsing resulted in empty HTML.") # Log
360
  raise ValueError("Markdown parsing resulted in empty HTML.")
361
+ # st.debug("Markdown converted to HTML.") # Too verbose
362
 
363
  # 3. Parse HTML with BeautifulSoup
364
  soup = BeautifulSoup(html_content, 'html.parser')
365
+ # st.debug("HTML parsed with BeautifulSoup.") # Too verbose
366
 
367
  # 4. Generate PDF using FPDF
368
  pdf = PDF_Generator()
369
  pdf.add_page()
370
  pdf.set_font('helvetica', '', 10) # Default font
371
+ st.info("PDF document initialized, adding content...") # Log
372
 
373
  # Iterate through top-level tags in the HTML body
374
  for element in soup.find_all(recursive=False):
 
 
375
  styles = set()
376
  def traverse(tag, current_styles):
 
377
  local_style_added = None
378
  if tag.name in ['b', 'strong']:
379
  current_styles.add('b')
 
382
  current_styles.add('i')
383
  local_style_added = 'i'
384
 
 
 
385
  if tag.name in ['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'ul', 'ol', 'table', 'br', 'hr']:
386
+ pdf.add_html_element(tag, current_styles.copy())
387
  else:
 
388
  if hasattr(tag, 'contents'):
389
  for child in tag.contents:
390
+ if isinstance(child, str):
391
+ pass
392
+ elif hasattr(child, 'name'):
393
+ traverse(child, current_styles.copy())
394
 
 
395
  if local_style_added and local_style_added in current_styles:
396
  current_styles.remove(local_style_added)
397
 
398
  traverse(element, styles)
399
 
400
+ st.info("Content added to PDF. Outputting PDF to buffer...") # Log
401
  # 5. Output PDF to BytesIO buffer
402
  pdf_output = pdf.output(dest='S') # Output as bytes string
403
  if isinstance(pdf_output, str):
404
  # If output is string (older fpdf versions?), encode it
405
  pdf_output = pdf_output.encode('latin-1')
406
 
407
+ st.success("PDF report generated successfully.") # Log success
408
  return BytesIO(pdf_output)
409
 
410
  except ImportError:
 
413
  except Exception as e:
414
  st.error(f"Failed to generate PDF locally using FPDF: {type(e).__name__}: {e}")
415
  st.exception(e) # Show traceback in streamlit logs
 
 
 
 
 
416
  raise Exception(f"Local FPDF PDF generation failed: {e}") from e
417
 
418
 
419
  def main():
420
  st.title("Quantitlytix AI")
421
  st.markdown("*Bank Statement Parser & Financial Report Generator*")
422
+ st.info("Application started. Ready for user input.") # Log app start
423
 
424
  # Initialize session state for last error
425
  if 'last_error' not in st.session_state:
 
433
 
434
  # Sidebar: Select input type: Bulk PDF or CSV Upload
435
  input_type = st.sidebar.radio("Select Input Type", ("Bulk Bank Statement Upload", "CSV Upload"))
436
+ st.info(f"Input type selected: {input_type}") # Log input type
437
 
438
  all_transactions = []
439
 
440
  if input_type == "Bulk Bank Statement Upload":
441
  uploaded_files = st.file_uploader("Upload PDF bank statements", type="pdf", accept_multiple_files=True)
442
  if uploaded_files:
443
+ st.info(f"User uploaded {len(uploaded_files)} PDF file(s).") # Log file upload
444
  total_files = len(uploaded_files)
445
  st.write(f"{total_files} PDF file(s) uploaded.")
446
  try:
 
452
 
453
  file_progress = 0
454
  for file_index, uploaded_file in enumerate(uploaded_files):
455
+ st.info(f"Starting processing for file {file_index+1}/{total_files}: {uploaded_file.name}") # Log individual file start
456
  # Update file progress
457
  file_progress = (file_index) / total_files
458
+ progress_bar.progress(overall_progress) # Corrected variable name
459
  status_text.text(f"Processing file {file_index+1} of {total_files}: {uploaded_file.name}")
460
 
461
  # Get PDF reader and page count
 
463
  pdf_reader, total_pages = read_pdf_pages(uploaded_file)
464
 
465
  if total_pages == 0:
466
+ st.warning(f"No pages found in {uploaded_file.name}. Skipping file.") # Log
467
  continue
468
 
469
  with st.spinner(f"Processing {uploaded_file.name} ({total_pages} pages)..."):
 
475
  status_text.text(f"File {file_index+1}/{total_files}: {message}")
476
 
477
  # Process the PDF page by page
478
+ st.info(f"Calling process_pdf_pages for {uploaded_file.name}...") # Log
479
  file_transactions = process_pdf_pages(
480
  model,
481
  pdf_reader,
 
485
 
486
  # Add transactions from this file to overall list
487
  all_transactions.extend(file_transactions)
488
+ st.info(f"Finished processing {uploaded_file.name}. Extracted {len(file_transactions)} transactions.") # Log file completion
489
 
490
  except Exception as e:
491
+ st.error(f"Error processing {uploaded_file.name}: {str(e)}") # Log specific file error
492
+ st.exception(e) # Show traceback
493
  continue
494
 
495
  # Complete the progress bar
496
  progress_bar.progress(1.0)
497
  status_text.text(f"Completed processing {total_files} files!")
498
+ st.success(f"All PDF files processed. Total transactions collected: {len(all_transactions)}.") # Log overall completion
499
 
500
  except Exception as e:
501
+ st.error(f"Overall error during PDF document processing: {str(e)}") # Log general error during PDF handling
502
  st.error("Please ensure you're using valid bank statement PDFs and a valid API key")
503
+ st.exception(e) # Show traceback
504
  elif input_type == "CSV Upload":
505
  uploaded_csv = st.file_uploader("Upload CSV of transactions", type="csv")
506
  if uploaded_csv:
507
+ st.info(f"User uploaded CSV file: {uploaded_csv.name}.") # Log
508
  try:
509
  df = pd.read_csv(uploaded_csv)
510
  # Drop 'Unnamed:' columns from the uploaded CSV
 
514
  # Convert dataframe to list of transaction dictionaries
515
  transactions = df.to_dict(orient='records')
516
  all_transactions.extend(transactions)
517
+ st.success(f"Successfully loaded {len(transactions)} transactions from CSV.") # Log
518
  except Exception as e:
519
+ st.error(f"Error processing CSV file: {str(e)}") # Log CSV error
520
+ st.exception(e)
521
 
522
  # If transactions are loaded, show DataFrame and update date ranges
523
  if all_transactions:
524
+ st.info("Consolidating and displaying all extracted transactions.") # Log
525
  df = pd.DataFrame(all_transactions)
526
  # Drop 'Unnamed:' columns from the final DataFrame
527
  if not df.empty:
528
  df = df.loc[:, ~df.columns.str.startswith('Unnamed:')]
529
  try:
 
 
530
  # Process dates and extract min/max dates for date range inputs
531
+ st.info("Parsing transaction dates and determining date range.") # Log
532
  df['Date'] = pd.to_datetime(df['Date'], format='%d/%m/%Y', errors='coerce')
533
 
534
  # Get min and max dates from transactions
 
539
  # Update session state with actual transaction date range
540
  st.session_state['min_date'] = min_date
541
  st.session_state['max_date'] = max_date
542
+ st.info(f"Determined transaction date range: {min_date} to {max_date}.") # Log
543
+ else:
544
+ st.warning("Could not determine valid date range from transactions. Using default dates.") # Log
545
 
546
  # Format dates for display
547
  df['Date'] = df['Date'].dt.strftime('%d/%m/%Y')
548
 
549
  except Exception as e:
550
+ st.warning("Some transaction dates could not be formatted correctly.")
551
  st.exception(e)
552
 
553
  st.success("Transactions loaded successfully!")
 
556
  else:
557
  st.warning("No valid transactions could be extracted from the documents.")
558
  else:
559
+ st.info("No transactions loaded yet. Upload files to begin.") # Initial state log
560
 
561
  # Financial report generation parameters
562
  st.write("### Generate Financial Report")
 
571
  statement_type = st.selectbox("Select Financial Statement", ["Income Statement", "Cashflow Statement", "Balance Sheet"])
572
 
573
  if st.button("Generate Financial Report"):
574
+ st.info(f"User clicked 'Generate Financial Report' for {statement_type} from {start_date} to {end_date}.") # Log button click
575
  if not all_transactions:
576
+ st.error("No transactions available to generate report. Please upload files first.") # Log
577
  else:
578
  # Filter transactions by date
579
+ st.info(f"Filtering {len(all_transactions)} transactions for the period {start_date} to {end_date}...") # Log filtering
580
  filtered_transactions = []
581
  for transaction in all_transactions:
582
  try:
 
584
  if start_date <= transaction_date <= end_date:
585
  filtered_transactions.append(transaction)
586
  except (ValueError, TypeError):
587
+ st.warning(f"Could not parse date for transaction, skipping: {transaction}") # Log problematic transactions
588
  continue
589
 
590
  if not filtered_transactions:
591
+ st.warning("No transactions found within the selected date range. Please adjust dates or upload relevant files.") # Log
592
  else:
593
+ st.info(f"Found {len(filtered_transactions)} transactions within the selected date range.") # Log filtered count
594
  try:
595
  model1 = configure_gemini1(api_key)
596
  combined_json = {"transactions": filtered_transactions}
597
  with st.spinner("Generating financial report..."):
598
  report_text = generate_financial_report(model1, combined_json, start_date, end_date, statement_type)
599
  if report_text:
600
+ st.success("Financial report generated successfully by Gemini!") # Log report text ready
601
 
602
  # Display the report as markdown
603
  st.markdown("### Financial Report Preview")
 
605
 
606
  # Create PDF from markdown
607
  try:
608
+ st.info("Attempting to generate PDF from the report markdown.") # Log PDF start
609
  pdf_buffer = create_pdf_report(report_text)
610
  st.download_button(
611
  label="Download Financial Report as PDF",
 
613
  file_name=f"{statement_type.replace(' ', '_')}_{datetime.now().strftime('%Y%m%d')}.pdf",
614
  mime="application/pdf"
615
  )
616
+ st.success("PDF download button enabled.") # Log
617
  except Exception as e:
618
+ st.error(f"Error generating PDF for download: {str(e)}") # Log PDF error
619
  st.info("For better PDF generation, please ensure NotoSans fonts are installed in the same directory.")
620
+ st.exception(e) # Show traceback
621
  except exceptions.ServiceUnavailable as e:
622
  if e.response.status_code == 504:
623
  st.error("Error generating report: Gemini API timed out (504). Please try reducing the time period for the report.")
624
  else:
625
+ st.error(f"Error generating financial report due to Gemini API issue: {str(e)}") # Log API error
626
+ st.exception(e) # Show traceback
627
  except Exception as e:
628
+ st.error(f"An unexpected error occurred while generating the financial report: {str(e)}") # Log general error
629
  if "504" in str(e):
630
  st.info("The Gemini API might be overloaded. Consider generating reports for smaller time periods.")
631
  elif len(filtered_transactions) > 500:
632
  st.info("For large datasets, consider generating reports for smaller time periods.")
633
+ st.exception(e) # Show traceback
634
 
635
  if __name__ == "__main__":
636
  main()