Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -19,24 +19,31 @@ from html_to_markdown import convert_to_markdown
|
|
| 19 |
api_key = os.getenv('Gemini')
|
| 20 |
|
| 21 |
def configure_gemini(api_key):
    """Register the API key with ``genai`` and build a Gemini model handle.

    Args:
        api_key: Key passed straight through to ``genai.configure``.

    Returns:
        A ``genai.GenerativeModel`` for 'gemini-2.0-flash-thinking-exp'.
    """
    genai.configure(api_key=api_key)
    flash_thinking_model = genai.GenerativeModel('gemini-2.0-flash-thinking-exp')
    return flash_thinking_model
|
| 24 |
|
| 25 |
def configure_gemini1(api_key):
    """Register the API key with ``genai`` and build the 'gemini-2.5-flash' model.

    ``main`` passes the returned model to ``generate_financial_report``.

    Args:
        api_key: Key passed straight through to ``genai.configure``.

    Returns:
        A ``genai.GenerativeModel`` for 'gemini-2.5-flash'.
    """
    genai.configure(api_key=api_key)
    flash_model = genai.GenerativeModel('gemini-2.5-flash')
    return flash_model
|
| 28 |
|
| 29 |
# Read PDF content page by page from a file-like object
|
| 30 |
def read_pdf_pages(file_obj):
    """Open a PDF from a seekable file-like object and count its pages.

    Args:
        file_obj: Seekable binary stream containing the PDF.

    Returns:
        A ``(pdf_reader, total_pages)`` tuple: the ``pypdf.PdfReader`` built
        on the stream and the length of its ``pages`` collection.
    """
    # Rewind so parsing starts at the beginning of the stream.
    file_obj.seek(0)
    reader = pypdf.PdfReader(file_obj)
    return reader, len(reader.pages)
|
| 35 |
|
| 36 |
# Extract text from a specific page
|
| 37 |
def extract_page_text(pdf_reader, page_num):
    """Return the text of one page, or an empty string when there is none.

    Args:
        pdf_reader: Object exposing a ``pages`` sequence whose items provide
            an ``extract_text()`` method.
        page_num: Zero-based index of the page to read.

    Returns:
        The extracted text; "" when ``page_num`` is at or beyond the page
        count, or when extraction yields a falsy value.
    """
    # Guard clause: indices past the last page have nothing to extract.
    if page_num >= len(pdf_reader.pages):
        return ""
    page = pdf_reader.pages[page_num]
    # Collapse None / "" from the extractor into a plain empty string.
    return page.extract_text() or ""
|
| 42 |
|
|
@@ -67,19 +74,26 @@ def process_with_gemini(model, text):
|
|
| 67 |
]
|
| 68 |
}"""
|
| 69 |
try:
|
|
|
|
| 70 |
response = model.generate_content([prompt, text])
|
| 71 |
time.sleep(6) # Sleep for 6 seconds to work around rate limit
|
|
|
|
| 72 |
return response.text
|
| 73 |
except exceptions.ServiceUnavailable as e:
|
| 74 |
if e.response.status_code == 504:
|
| 75 |
st.error("Error generating report: Gemini API timed out (504). Please try reducing the time period for the report.")
|
| 76 |
return None
|
| 77 |
else:
|
|
|
|
| 78 |
raise
|
|
|
|
|
|
|
|
|
|
| 79 |
|
| 80 |
# Process PDF page by page to handle large files
|
| 81 |
def process_pdf_pages(model, pdf_reader, total_pages, progress_callback=None):
|
| 82 |
all_transactions = []
|
|
|
|
| 83 |
|
| 84 |
# Process pages individually or in small chunks
|
| 85 |
for page_num in range(total_pages):
|
|
@@ -91,9 +105,11 @@ def process_pdf_pages(model, pdf_reader, total_pages, progress_callback=None):
|
|
| 91 |
page_text = extract_page_text(pdf_reader, page_num)
|
| 92 |
|
| 93 |
if not page_text.strip():
|
|
|
|
| 94 |
continue # Skip empty pages
|
| 95 |
|
| 96 |
# Process the page with Gemini
|
|
|
|
| 97 |
json_response = process_with_gemini(model, page_text)
|
| 98 |
|
| 99 |
if json_response:
|
|
@@ -102,6 +118,7 @@ def process_pdf_pages(model, pdf_reader, total_pages, progress_callback=None):
|
|
| 102 |
end_idx = json_response.rfind('}') + 1
|
| 103 |
|
| 104 |
if start_idx == -1 or end_idx == 0:
|
|
|
|
| 105 |
continue # Skip invalid JSON
|
| 106 |
|
| 107 |
json_str = json_response[start_idx:end_idx]
|
|
@@ -110,16 +127,23 @@ def process_pdf_pages(model, pdf_reader, total_pages, progress_callback=None):
|
|
| 110 |
try:
|
| 111 |
data = json.loads(json_str)
|
| 112 |
transactions = data.get('transactions', [])
|
| 113 |
-
|
| 114 |
-
|
| 115 |
-
|
|
|
|
|
|
|
| 116 |
except json.JSONDecodeError:
|
|
|
|
| 117 |
continue # Skip invalid JSON
|
| 118 |
-
|
|
|
|
|
|
|
|
|
|
| 119 |
return all_transactions
|
| 120 |
|
| 121 |
# Generate financial report from aggregated JSON transactions and chosen parameters
|
| 122 |
def generate_financial_report(model, json_data, start_date, end_date, statement_type):
|
|
|
|
| 123 |
prompt = f"""Based on the following transactions JSON data:
|
| 124 |
{json.dumps(json_data)}
|
| 125 |
Generate a detailed {statement_type} report for the period from {start_date.strftime('%d/%m/%Y')} to {end_date.strftime('%d/%m/%Y')}. Present the report in a standard accounting format relevant to South Africa, but with improved readability and visual appeal.
|
|
@@ -137,8 +161,10 @@ Concise Summary: Provide a concluding summary paragraph that encapsulates the ov
|
|
| 137 |
Format the report in Markdown for better visual structure.
|
| 138 |
Do not name the company if name is not there and return just the report and nothing else."""
|
| 139 |
try:
|
|
|
|
| 140 |
response = model.generate_content([prompt])
|
| 141 |
time.sleep(7) # Sleep for 7 seconds to work around rate limit
|
|
|
|
| 142 |
return response.text
|
| 143 |
except exceptions.ServiceUnavailable as e:
|
| 144 |
if e.response.status_code == 504:
|
|
@@ -146,7 +172,11 @@ Do not name the company if name is not there and return just the report and noth
|
|
| 146 |
st.session_state['last_error'] = "504" # Store the error in session state
|
| 147 |
return None
|
| 148 |
else:
|
|
|
|
| 149 |
raise
|
|
|
|
|
|
|
|
|
|
| 150 |
|
| 151 |
|
| 152 |
# Install required libraries:
|
|
@@ -312,34 +342,38 @@ def create_pdf_report(report_text):
|
|
| 312 |
Exception: If PDF generation fails.
|
| 313 |
"""
|
| 314 |
if not report_text:
|
|
|
|
| 315 |
raise ValueError("Input report_text cannot be empty.")
|
| 316 |
|
| 317 |
try:
|
|
|
|
| 318 |
# 1. Clean Markdown
|
| 319 |
cleaned_md = re.sub(r'^```markdown\s*', '', report_text, flags=re.MULTILINE)
|
| 320 |
cleaned_md = re.sub(r'\s*```$', '', cleaned_md, flags=re.MULTILINE)
|
| 321 |
cleaned_md = cleaned_md.strip()
|
|
|
|
| 322 |
|
| 323 |
# 2. Convert Markdown to HTML
|
| 324 |
html_content = markdown.markdown(cleaned_md, extensions=['tables', 'fenced_code', 'sane_lists'])
|
| 325 |
if not html_content:
|
|
|
|
| 326 |
raise ValueError("Markdown parsing resulted in empty HTML.")
|
|
|
|
| 327 |
|
| 328 |
# 3. Parse HTML with BeautifulSoup
|
| 329 |
soup = BeautifulSoup(html_content, 'html.parser')
|
|
|
|
| 330 |
|
| 331 |
# 4. Generate PDF using FPDF
|
| 332 |
pdf = PDF_Generator()
|
| 333 |
pdf.add_page()
|
| 334 |
pdf.set_font('helvetica', '', 10) # Default font
|
|
|
|
| 335 |
|
| 336 |
# Iterate through top-level tags in the HTML body
|
| 337 |
for element in soup.find_all(recursive=False):
|
| 338 |
-
# Track basic nested styles like bold/italic
|
| 339 |
-
# This is very basic and might not handle complex nesting well
|
| 340 |
styles = set()
|
| 341 |
def traverse(tag, current_styles):
|
| 342 |
-
# Check for styling tags
|
| 343 |
local_style_added = None
|
| 344 |
if tag.name in ['b', 'strong']:
|
| 345 |
current_styles.add('b')
|
|
@@ -348,32 +382,29 @@ def create_pdf_report(report_text):
|
|
| 348 |
current_styles.add('i')
|
| 349 |
local_style_added = 'i'
|
| 350 |
|
| 351 |
-
# If it's a text node, process it with current styles (handled within add_html_element)
|
| 352 |
-
# If it's a known block element, process it
|
| 353 |
if tag.name in ['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'ul', 'ol', 'table', 'br', 'hr']:
|
| 354 |
-
pdf.add_html_element(tag, current_styles.copy())
|
| 355 |
else:
|
| 356 |
-
# Recursively process children for other tags (like span, div, or styling tags)
|
| 357 |
if hasattr(tag, 'contents'):
|
| 358 |
for child in tag.contents:
|
| 359 |
-
if isinstance(child, str):
|
| 360 |
-
pass
|
| 361 |
-
elif hasattr(child, 'name'):
|
| 362 |
-
traverse(child, current_styles.copy())
|
| 363 |
|
| 364 |
-
# Remove local style after processing the tag and its children
|
| 365 |
if local_style_added and local_style_added in current_styles:
|
| 366 |
current_styles.remove(local_style_added)
|
| 367 |
|
| 368 |
traverse(element, styles)
|
| 369 |
|
| 370 |
-
|
| 371 |
# 5. Output PDF to BytesIO buffer
|
| 372 |
pdf_output = pdf.output(dest='S') # Output as bytes string
|
| 373 |
if isinstance(pdf_output, str):
|
| 374 |
# If output is string (older fpdf versions?), encode it
|
| 375 |
pdf_output = pdf_output.encode('latin-1')
|
| 376 |
|
|
|
|
| 377 |
return BytesIO(pdf_output)
|
| 378 |
|
| 379 |
except ImportError:
|
|
@@ -382,17 +413,13 @@ def create_pdf_report(report_text):
|
|
| 382 |
except Exception as e:
|
| 383 |
st.error(f"Failed to generate PDF locally using FPDF: {type(e).__name__}: {e}")
|
| 384 |
st.exception(e) # Show traceback in streamlit logs
|
| 385 |
-
# Log intermediate steps if possible
|
| 386 |
-
# print("--- Cleaned Markdown ---")
|
| 387 |
-
# print(cleaned_md)
|
| 388 |
-
# print("--- Generated HTML ---")
|
| 389 |
-
# print(html_content)
|
| 390 |
raise Exception(f"Local FPDF PDF generation failed: {e}") from e
|
| 391 |
|
| 392 |
|
| 393 |
def main():
|
| 394 |
st.title("Quantitlytix AI")
|
| 395 |
st.markdown("*Bank Statement Parser & Financial Report Generator*")
|
|
|
|
| 396 |
|
| 397 |
# Initialize session state for last error
|
| 398 |
if 'last_error' not in st.session_state:
|
|
@@ -406,12 +433,14 @@ def main():
|
|
| 406 |
|
| 407 |
# Sidebar: Select input type: Bulk PDF or CSV Upload
|
| 408 |
input_type = st.sidebar.radio("Select Input Type", ("Bulk Bank Statement Upload", "CSV Upload"))
|
|
|
|
| 409 |
|
| 410 |
all_transactions = []
|
| 411 |
|
| 412 |
if input_type == "Bulk Bank Statement Upload":
|
| 413 |
uploaded_files = st.file_uploader("Upload PDF bank statements", type="pdf", accept_multiple_files=True)
|
| 414 |
if uploaded_files:
|
|
|
|
| 415 |
total_files = len(uploaded_files)
|
| 416 |
st.write(f"{total_files} PDF file(s) uploaded.")
|
| 417 |
try:
|
|
@@ -423,9 +452,10 @@ def main():
|
|
| 423 |
|
| 424 |
file_progress = 0
|
| 425 |
for file_index, uploaded_file in enumerate(uploaded_files):
|
|
|
|
| 426 |
# Update file progress
|
| 427 |
file_progress = (file_index) / total_files
|
| 428 |
-
progress_bar.progress(
|
| 429 |
status_text.text(f"Processing file {file_index+1} of {total_files}: {uploaded_file.name}")
|
| 430 |
|
| 431 |
# Get PDF reader and page count
|
|
@@ -433,7 +463,7 @@ def main():
|
|
| 433 |
pdf_reader, total_pages = read_pdf_pages(uploaded_file)
|
| 434 |
|
| 435 |
if total_pages == 0:
|
| 436 |
-
st.warning(f"No pages found in {uploaded_file.name}.")
|
| 437 |
continue
|
| 438 |
|
| 439 |
with st.spinner(f"Processing {uploaded_file.name} ({total_pages} pages)..."):
|
|
@@ -445,6 +475,7 @@ def main():
|
|
| 445 |
status_text.text(f"File {file_index+1}/{total_files}: {message}")
|
| 446 |
|
| 447 |
# Process the PDF page by page
|
|
|
|
| 448 |
file_transactions = process_pdf_pages(
|
| 449 |
model,
|
| 450 |
pdf_reader,
|
|
@@ -454,21 +485,26 @@ def main():
|
|
| 454 |
|
| 455 |
# Add transactions from this file to overall list
|
| 456 |
all_transactions.extend(file_transactions)
|
|
|
|
| 457 |
|
| 458 |
except Exception as e:
|
| 459 |
-
st.error(f"Error processing {uploaded_file.name}: {str(e)}")
|
|
|
|
| 460 |
continue
|
| 461 |
|
| 462 |
# Complete the progress bar
|
| 463 |
progress_bar.progress(1.0)
|
| 464 |
status_text.text(f"Completed processing {total_files} files!")
|
|
|
|
| 465 |
|
| 466 |
except Exception as e:
|
| 467 |
-
st.error(f"
|
| 468 |
st.error("Please ensure you're using valid bank statement PDFs and a valid API key")
|
|
|
|
| 469 |
elif input_type == "CSV Upload":
|
| 470 |
uploaded_csv = st.file_uploader("Upload CSV of transactions", type="csv")
|
| 471 |
if uploaded_csv:
|
|
|
|
| 472 |
try:
|
| 473 |
df = pd.read_csv(uploaded_csv)
|
| 474 |
# Drop 'Unnamed:' columns from the uploaded CSV
|
|
@@ -478,19 +514,21 @@ def main():
|
|
| 478 |
# Convert dataframe to list of transaction dictionaries
|
| 479 |
transactions = df.to_dict(orient='records')
|
| 480 |
all_transactions.extend(transactions)
|
|
|
|
| 481 |
except Exception as e:
|
| 482 |
-
st.error(f"Error processing CSV file: {str(e)}")
|
|
|
|
| 483 |
|
| 484 |
# If transactions are loaded, show DataFrame and update date ranges
|
| 485 |
if all_transactions:
|
|
|
|
| 486 |
df = pd.DataFrame(all_transactions)
|
| 487 |
# Drop 'Unnamed:' columns from the final DataFrame
|
| 488 |
if not df.empty:
|
| 489 |
df = df.loc[:, ~df.columns.str.startswith('Unnamed:')]
|
| 490 |
try:
|
| 491 |
-
|
| 492 |
-
|
| 493 |
# Process dates and extract min/max dates for date range inputs
|
|
|
|
| 494 |
df['Date'] = pd.to_datetime(df['Date'], format='%d/%m/%Y', errors='coerce')
|
| 495 |
|
| 496 |
# Get min and max dates from transactions
|
|
@@ -501,12 +539,15 @@ def main():
|
|
| 501 |
# Update session state with actual transaction date range
|
| 502 |
st.session_state['min_date'] = min_date
|
| 503 |
st.session_state['max_date'] = max_date
|
|
|
|
|
|
|
|
|
|
| 504 |
|
| 505 |
# Format dates for display
|
| 506 |
df['Date'] = df['Date'].dt.strftime('%d/%m/%Y')
|
| 507 |
|
| 508 |
except Exception as e:
|
| 509 |
-
st.warning("Some
|
| 510 |
st.exception(e)
|
| 511 |
|
| 512 |
st.success("Transactions loaded successfully!")
|
|
@@ -515,7 +556,7 @@ def main():
|
|
| 515 |
else:
|
| 516 |
st.warning("No valid transactions could be extracted from the documents.")
|
| 517 |
else:
|
| 518 |
-
st.info("No transactions loaded yet.")
|
| 519 |
|
| 520 |
# Financial report generation parameters
|
| 521 |
st.write("### Generate Financial Report")
|
|
@@ -530,10 +571,12 @@ def main():
|
|
| 530 |
statement_type = st.selectbox("Select Financial Statement", ["Income Statement", "Cashflow Statement", "Balance Sheet"])
|
| 531 |
|
| 532 |
if st.button("Generate Financial Report"):
|
|
|
|
| 533 |
if not all_transactions:
|
| 534 |
-
st.error("No transactions available to generate report.")
|
| 535 |
else:
|
| 536 |
# Filter transactions by date
|
|
|
|
| 537 |
filtered_transactions = []
|
| 538 |
for transaction in all_transactions:
|
| 539 |
try:
|
|
@@ -541,19 +584,20 @@ def main():
|
|
| 541 |
if start_date <= transaction_date <= end_date:
|
| 542 |
filtered_transactions.append(transaction)
|
| 543 |
except (ValueError, TypeError):
|
| 544 |
-
st.warning(f"Could not parse date for transaction: {transaction}")
|
| 545 |
continue
|
| 546 |
|
| 547 |
if not filtered_transactions:
|
| 548 |
-
st.warning("No transactions found within the selected date range.")
|
| 549 |
else:
|
|
|
|
| 550 |
try:
|
| 551 |
model1 = configure_gemini1(api_key)
|
| 552 |
combined_json = {"transactions": filtered_transactions}
|
| 553 |
with st.spinner("Generating financial report..."):
|
| 554 |
report_text = generate_financial_report(model1, combined_json, start_date, end_date, statement_type)
|
| 555 |
if report_text:
|
| 556 |
-
st.success("Financial report generated!")
|
| 557 |
|
| 558 |
# Display the report as markdown
|
| 559 |
st.markdown("### Financial Report Preview")
|
|
@@ -561,6 +605,7 @@ def main():
|
|
| 561 |
|
| 562 |
# Create PDF from markdown
|
| 563 |
try:
|
|
|
|
| 564 |
pdf_buffer = create_pdf_report(report_text)
|
| 565 |
st.download_button(
|
| 566 |
label="Download Financial Report as PDF",
|
|
@@ -568,20 +613,24 @@ def main():
|
|
| 568 |
file_name=f"{statement_type.replace(' ', '_')}_{datetime.now().strftime('%Y%m%d')}.pdf",
|
| 569 |
mime="application/pdf"
|
| 570 |
)
|
|
|
|
| 571 |
except Exception as e:
|
| 572 |
-
st.error(f"Error generating PDF: {str(e)}")
|
| 573 |
st.info("For better PDF generation, please ensure NotoSans fonts are installed in the same directory.")
|
|
|
|
| 574 |
except exceptions.ServiceUnavailable as e:
|
| 575 |
if e.response.status_code == 504:
|
| 576 |
st.error("Error generating report: Gemini API timed out (504). Please try reducing the time period for the report.")
|
| 577 |
else:
|
| 578 |
-
st.error(f"Error generating financial report: {str(e)}")
|
|
|
|
| 579 |
except Exception as e:
|
| 580 |
-
st.error(f"
|
| 581 |
if "504" in str(e):
|
| 582 |
st.info("The Gemini API might be overloaded. Consider generating reports for smaller time periods.")
|
| 583 |
elif len(filtered_transactions) > 500:
|
| 584 |
st.info("For large datasets, consider generating reports for smaller time periods.")
|
|
|
|
| 585 |
|
| 586 |
if __name__ == "__main__":
|
| 587 |
main()
|
|
|
|
| 19 |
api_key = os.getenv('Gemini')
|
| 20 |
|
| 21 |
def configure_gemini(api_key):
    """Configure ``genai`` and return the model used for transaction extraction.

    Posts a Streamlit info message, registers ``api_key`` via
    ``genai.configure``, then builds the model.

    Args:
        api_key: Key passed straight through to ``genai.configure``.

    Returns:
        A ``genai.GenerativeModel`` for 'gemini-2.0-flash-thinking-exp'.
    """
    # Progress note shown in the Streamlit UI before touching the client.
    st.info("Configuring Gemini API for transaction extraction...")
    genai.configure(api_key=api_key)
    extraction_model = genai.GenerativeModel('gemini-2.0-flash-thinking-exp')
    return extraction_model
|
| 25 |
|
| 26 |
def configure_gemini1(api_key):
    """Configure ``genai`` and return the model used for report generation.

    Posts a Streamlit info message, registers ``api_key`` via
    ``genai.configure``, then builds the model.

    Args:
        api_key: Key passed straight through to ``genai.configure``.

    Returns:
        A ``genai.GenerativeModel`` for 'gemini-2.5-flash'.
    """
    # Progress note shown in the Streamlit UI before touching the client.
    st.info("Configuring Gemini API for report generation...")
    genai.configure(api_key=api_key)
    report_model = genai.GenerativeModel('gemini-2.5-flash')
    return report_model
|
| 30 |
|
| 31 |
# Read PDF content page by page from a file-like object
|
| 32 |
def read_pdf_pages(file_obj):
    """Open a PDF from a seekable file-like object and count its pages.

    Progress is reported through ``st.info`` before and after parsing.

    Args:
        file_obj: Seekable binary stream containing the PDF. A ``name``
            attribute is used for the log message when present, but is not
            required.

    Returns:
        A ``(pdf_reader, total_pages)`` tuple: the ``pypdf.PdfReader`` built
        on the stream and the length of its ``pages`` collection.
    """
    # Plain streams such as io.BytesIO carry no ``name``; fall back to a
    # generic label so logging never prevents the PDF from being read.
    source_label = getattr(file_obj, "name", "uploaded file")
    st.info(f"Reading PDF pages from {source_label}...")
    file_obj.seek(0)  # Ensure the file pointer is at the start
    pdf_reader = pypdf.PdfReader(file_obj)
    total_pages = len(pdf_reader.pages)
    st.info(f"Found {total_pages} pages in PDF.")
    return pdf_reader, total_pages
|
| 39 |
|
| 40 |
# Extract text from a specific page
|
| 41 |
def extract_page_text(pdf_reader, page_num):
    """Return the text of one page, warning when the page has no usable text.

    Args:
        pdf_reader: Object exposing a ``pages`` sequence whose items provide
            an ``extract_text()`` method.
        page_num: Zero-based index of the page to read.

    Returns:
        The extracted text; "" when ``page_num`` is at or beyond the page
        count, or when extraction yields a falsy value.
    """
    if page_num >= len(pdf_reader.pages):
        return ""
    # Normalise a falsy result (None or "") *before* calling .strip(), so a
    # page without extractable text triggers the warning below rather than
    # an AttributeError on None.
    text = pdf_reader.pages[page_num].extract_text() or ""
    if not text.strip():
        st.warning(f"Page {page_num + 1} appears to be empty or contains no extractable text.")
    return text
|
| 49 |
|
|
|
|
| 74 |
]
|
| 75 |
}"""
|
| 76 |
try:
|
| 77 |
+
# st.debug("Sending text chunk to Gemini for transaction extraction...") # Too verbose
|
| 78 |
response = model.generate_content([prompt, text])
|
| 79 |
time.sleep(6) # Sleep for 6 seconds to work around rate limit
|
| 80 |
+
# st.debug("Received response from Gemini for transaction extraction.") # Too verbose
|
| 81 |
return response.text
|
| 82 |
except exceptions.ServiceUnavailable as e:
|
| 83 |
if e.response.status_code == 504:
|
| 84 |
st.error("Error generating report: Gemini API timed out (504). Please try reducing the time period for the report.")
|
| 85 |
return None
|
| 86 |
else:
|
| 87 |
+
st.error(f"Gemini API error during transaction extraction: {e}") # Log other API errors
|
| 88 |
raise
|
| 89 |
+
except Exception as e:
|
| 90 |
+
st.error(f"An unexpected error occurred during Gemini transaction extraction: {e}") # Catch other potential errors
|
| 91 |
+
return None
|
| 92 |
|
| 93 |
# Process PDF page by page to handle large files
|
| 94 |
def process_pdf_pages(model, pdf_reader, total_pages, progress_callback=None):
|
| 95 |
all_transactions = []
|
| 96 |
+
st.info(f"Starting page-by-page PDF processing for {total_pages} pages...") # Log
|
| 97 |
|
| 98 |
# Process pages individually or in small chunks
|
| 99 |
for page_num in range(total_pages):
|
|
|
|
| 105 |
page_text = extract_page_text(pdf_reader, page_num)
|
| 106 |
|
| 107 |
if not page_text.strip():
|
| 108 |
+
st.warning(f"Skipping empty or unreadable page {page_num + 1}.") # Log skipped pages
|
| 109 |
continue # Skip empty pages
|
| 110 |
|
| 111 |
# Process the page with Gemini
|
| 112 |
+
st.info(f"Sending page {page_num + 1} text to Gemini for transaction extraction...") # Log
|
| 113 |
json_response = process_with_gemini(model, page_text)
|
| 114 |
|
| 115 |
if json_response:
|
|
|
|
| 118 |
end_idx = json_response.rfind('}') + 1
|
| 119 |
|
| 120 |
if start_idx == -1 or end_idx == 0:
|
| 121 |
+
st.warning(f"No valid JSON found in Gemini response for page {page_num + 1}. Raw response: {json_response[:200]}...") # Log invalid JSON structure
|
| 122 |
continue # Skip invalid JSON
|
| 123 |
|
| 124 |
json_str = json_response[start_idx:end_idx]
|
|
|
|
| 127 |
try:
|
| 128 |
data = json.loads(json_str)
|
| 129 |
transactions = data.get('transactions', [])
|
| 130 |
+
if transactions:
|
| 131 |
+
st.info(f"Successfully extracted {len(transactions)} transactions from page {page_num + 1}.") # Log successful extraction
|
| 132 |
+
all_transactions.extend(transactions)
|
| 133 |
+
else:
|
| 134 |
+
st.info(f"No transactions found on page {page_num + 1} based on Gemini's analysis.") # Log no transactions found on page
|
| 135 |
except json.JSONDecodeError:
|
| 136 |
+
st.error(f"Failed to decode JSON from Gemini response for page {page_num + 1}. Check response format. Raw JSON snippet: {json_str[:200]}...") # Log JSON decode errors
|
| 137 |
continue # Skip invalid JSON
|
| 138 |
+
else:
|
| 139 |
+
st.warning(f"Gemini returned no response for page {page_num + 1}. This page's transactions might be missing.") # Log no response from Gemini
|
| 140 |
+
|
| 141 |
+
st.info(f"Finished processing all pages. Total transactions extracted: {len(all_transactions)}.") # Final log for extraction
|
| 142 |
return all_transactions
|
| 143 |
|
| 144 |
# Generate financial report from aggregated JSON transactions and chosen parameters
|
| 145 |
def generate_financial_report(model, json_data, start_date, end_date, statement_type):
|
| 146 |
+
st.info(f"Preparing prompt for Gemini to generate {statement_type} report from {start_date} to {end_date}...") # Log
|
| 147 |
prompt = f"""Based on the following transactions JSON data:
|
| 148 |
{json.dumps(json_data)}
|
| 149 |
Generate a detailed {statement_type} report for the period from {start_date.strftime('%d/%m/%Y')} to {end_date.strftime('%d/%m/%Y')}. Present the report in a standard accounting format relevant to South Africa, but with improved readability and visual appeal.
|
|
|
|
| 161 |
Format the report in Markdown for better visual structure.
|
| 162 |
Do not name the company if name is not there and return just the report and nothing else."""
|
| 163 |
try:
|
| 164 |
+
st.info("Sending request to Gemini for financial report generation...") # Log
|
| 165 |
response = model.generate_content([prompt])
|
| 166 |
time.sleep(7) # Sleep for 7 seconds to work around rate limit
|
| 167 |
+
st.success("Successfully received financial report from Gemini.") # Log success
|
| 168 |
return response.text
|
| 169 |
except exceptions.ServiceUnavailable as e:
|
| 170 |
if e.response.status_code == 504:
|
|
|
|
| 172 |
st.session_state['last_error'] = "504" # Store the error in session state
|
| 173 |
return None
|
| 174 |
else:
|
| 175 |
+
st.error(f"Gemini API error during report generation: {e}") # Log other API errors
|
| 176 |
raise
|
| 177 |
+
except Exception as e:
|
| 178 |
+
st.error(f"An unexpected error occurred during Gemini report generation: {e}") # Catch other potential errors
|
| 179 |
+
return None
|
| 180 |
|
| 181 |
|
| 182 |
# Install required libraries:
|
|
|
|
| 342 |
Exception: If PDF generation fails.
|
| 343 |
"""
|
| 344 |
if not report_text:
|
| 345 |
+
st.warning("Report text is empty, skipping PDF generation.") # Log
|
| 346 |
raise ValueError("Input report_text cannot be empty.")
|
| 347 |
|
| 348 |
try:
|
| 349 |
+
st.info("Starting PDF generation from markdown report...") # Log
|
| 350 |
# 1. Clean Markdown
|
| 351 |
cleaned_md = re.sub(r'^```markdown\s*', '', report_text, flags=re.MULTILINE)
|
| 352 |
cleaned_md = re.sub(r'\s*```$', '', cleaned_md, flags=re.MULTILINE)
|
| 353 |
cleaned_md = cleaned_md.strip()
|
| 354 |
+
# st.debug("Markdown cleaned.") # Too verbose
|
| 355 |
|
| 356 |
# 2. Convert Markdown to HTML
|
| 357 |
html_content = markdown.markdown(cleaned_md, extensions=['tables', 'fenced_code', 'sane_lists'])
|
| 358 |
if not html_content:
|
| 359 |
+
st.error("Markdown parsing resulted in empty HTML.") # Log
|
| 360 |
raise ValueError("Markdown parsing resulted in empty HTML.")
|
| 361 |
+
# st.debug("Markdown converted to HTML.") # Too verbose
|
| 362 |
|
| 363 |
# 3. Parse HTML with BeautifulSoup
|
| 364 |
soup = BeautifulSoup(html_content, 'html.parser')
|
| 365 |
+
# st.debug("HTML parsed with BeautifulSoup.") # Too verbose
|
| 366 |
|
| 367 |
# 4. Generate PDF using FPDF
|
| 368 |
pdf = PDF_Generator()
|
| 369 |
pdf.add_page()
|
| 370 |
pdf.set_font('helvetica', '', 10) # Default font
|
| 371 |
+
st.info("PDF document initialized, adding content...") # Log
|
| 372 |
|
| 373 |
# Iterate through top-level tags in the HTML body
|
| 374 |
for element in soup.find_all(recursive=False):
|
|
|
|
|
|
|
| 375 |
styles = set()
|
| 376 |
def traverse(tag, current_styles):
|
|
|
|
| 377 |
local_style_added = None
|
| 378 |
if tag.name in ['b', 'strong']:
|
| 379 |
current_styles.add('b')
|
|
|
|
| 382 |
current_styles.add('i')
|
| 383 |
local_style_added = 'i'
|
| 384 |
|
|
|
|
|
|
|
| 385 |
if tag.name in ['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'ul', 'ol', 'table', 'br', 'hr']:
|
| 386 |
+
pdf.add_html_element(tag, current_styles.copy())
|
| 387 |
else:
|
|
|
|
| 388 |
if hasattr(tag, 'contents'):
|
| 389 |
for child in tag.contents:
|
| 390 |
+
if isinstance(child, str):
|
| 391 |
+
pass
|
| 392 |
+
elif hasattr(child, 'name'):
|
| 393 |
+
traverse(child, current_styles.copy())
|
| 394 |
|
|
|
|
| 395 |
if local_style_added and local_style_added in current_styles:
|
| 396 |
current_styles.remove(local_style_added)
|
| 397 |
|
| 398 |
traverse(element, styles)
|
| 399 |
|
| 400 |
+
st.info("Content added to PDF. Outputting PDF to buffer...") # Log
|
| 401 |
# 5. Output PDF to BytesIO buffer
|
| 402 |
pdf_output = pdf.output(dest='S') # Output as bytes string
|
| 403 |
if isinstance(pdf_output, str):
|
| 404 |
# If output is string (older fpdf versions?), encode it
|
| 405 |
pdf_output = pdf_output.encode('latin-1')
|
| 406 |
|
| 407 |
+
st.success("PDF report generated successfully.") # Log success
|
| 408 |
return BytesIO(pdf_output)
|
| 409 |
|
| 410 |
except ImportError:
|
|
|
|
| 413 |
except Exception as e:
|
| 414 |
st.error(f"Failed to generate PDF locally using FPDF: {type(e).__name__}: {e}")
|
| 415 |
st.exception(e) # Show traceback in streamlit logs
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 416 |
raise Exception(f"Local FPDF PDF generation failed: {e}") from e
|
| 417 |
|
| 418 |
|
| 419 |
def main():
|
| 420 |
st.title("Quantitlytix AI")
|
| 421 |
st.markdown("*Bank Statement Parser & Financial Report Generator*")
|
| 422 |
+
st.info("Application started. Ready for user input.") # Log app start
|
| 423 |
|
| 424 |
# Initialize session state for last error
|
| 425 |
if 'last_error' not in st.session_state:
|
|
|
|
| 433 |
|
| 434 |
# Sidebar: Select input type: Bulk PDF or CSV Upload
|
| 435 |
input_type = st.sidebar.radio("Select Input Type", ("Bulk Bank Statement Upload", "CSV Upload"))
|
| 436 |
+
st.info(f"Input type selected: {input_type}") # Log input type
|
| 437 |
|
| 438 |
all_transactions = []
|
| 439 |
|
| 440 |
if input_type == "Bulk Bank Statement Upload":
|
| 441 |
uploaded_files = st.file_uploader("Upload PDF bank statements", type="pdf", accept_multiple_files=True)
|
| 442 |
if uploaded_files:
|
| 443 |
+
st.info(f"User uploaded {len(uploaded_files)} PDF file(s).") # Log file upload
|
| 444 |
total_files = len(uploaded_files)
|
| 445 |
st.write(f"{total_files} PDF file(s) uploaded.")
|
| 446 |
try:
|
|
|
|
| 452 |
|
| 453 |
file_progress = 0
|
| 454 |
for file_index, uploaded_file in enumerate(uploaded_files):
|
| 455 |
+
st.info(f"Starting processing for file {file_index+1}/{total_files}: {uploaded_file.name}") # Log individual file start
|
| 456 |
# Update file progress
|
| 457 |
file_progress = (file_index) / total_files
|
| 458 |
+
progress_bar.progress(overall_progress) # Corrected variable name
|
| 459 |
status_text.text(f"Processing file {file_index+1} of {total_files}: {uploaded_file.name}")
|
| 460 |
|
| 461 |
# Get PDF reader and page count
|
|
|
|
| 463 |
pdf_reader, total_pages = read_pdf_pages(uploaded_file)
|
| 464 |
|
| 465 |
if total_pages == 0:
|
| 466 |
+
st.warning(f"No pages found in {uploaded_file.name}. Skipping file.") # Log
|
| 467 |
continue
|
| 468 |
|
| 469 |
with st.spinner(f"Processing {uploaded_file.name} ({total_pages} pages)..."):
|
|
|
|
| 475 |
status_text.text(f"File {file_index+1}/{total_files}: {message}")
|
| 476 |
|
| 477 |
# Process the PDF page by page
|
| 478 |
+
st.info(f"Calling process_pdf_pages for {uploaded_file.name}...") # Log
|
| 479 |
file_transactions = process_pdf_pages(
|
| 480 |
model,
|
| 481 |
pdf_reader,
|
|
|
|
| 485 |
|
| 486 |
# Add transactions from this file to overall list
|
| 487 |
all_transactions.extend(file_transactions)
|
| 488 |
+
st.info(f"Finished processing {uploaded_file.name}. Extracted {len(file_transactions)} transactions.") # Log file completion
|
| 489 |
|
| 490 |
except Exception as e:
|
| 491 |
+
st.error(f"Error processing {uploaded_file.name}: {str(e)}") # Log specific file error
|
| 492 |
+
st.exception(e) # Show traceback
|
| 493 |
continue
|
| 494 |
|
| 495 |
# Complete the progress bar
|
| 496 |
progress_bar.progress(1.0)
|
| 497 |
status_text.text(f"Completed processing {total_files} files!")
|
| 498 |
+
st.success(f"All PDF files processed. Total transactions collected: {len(all_transactions)}.") # Log overall completion
|
| 499 |
|
| 500 |
except Exception as e:
|
| 501 |
+
st.error(f"Overall error during PDF document processing: {str(e)}") # Log general error during PDF handling
|
| 502 |
st.error("Please ensure you're using valid bank statement PDFs and a valid API key")
|
| 503 |
+
st.exception(e) # Show traceback
|
| 504 |
elif input_type == "CSV Upload":
|
| 505 |
uploaded_csv = st.file_uploader("Upload CSV of transactions", type="csv")
|
| 506 |
if uploaded_csv:
|
| 507 |
+
st.info(f"User uploaded CSV file: {uploaded_csv.name}.") # Log
|
| 508 |
try:
|
| 509 |
df = pd.read_csv(uploaded_csv)
|
| 510 |
# Drop 'Unnamed:' columns from the uploaded CSV
|
|
|
|
| 514 |
# Convert dataframe to list of transaction dictionaries
|
| 515 |
transactions = df.to_dict(orient='records')
|
| 516 |
all_transactions.extend(transactions)
|
| 517 |
+
st.success(f"Successfully loaded {len(transactions)} transactions from CSV.") # Log
|
| 518 |
except Exception as e:
|
| 519 |
+
st.error(f"Error processing CSV file: {str(e)}") # Log CSV error
|
| 520 |
+
st.exception(e)
|
| 521 |
|
| 522 |
# If transactions are loaded, show DataFrame and update date ranges
|
| 523 |
if all_transactions:
|
| 524 |
+
st.info("Consolidating and displaying all extracted transactions.") # Log
|
| 525 |
df = pd.DataFrame(all_transactions)
|
| 526 |
# Drop 'Unnamed:' columns from the final DataFrame
|
| 527 |
if not df.empty:
|
| 528 |
df = df.loc[:, ~df.columns.str.startswith('Unnamed:')]
|
| 529 |
try:
|
|
|
|
|
|
|
| 530 |
# Process dates and extract min/max dates for date range inputs
|
| 531 |
+
st.info("Parsing transaction dates and determining date range.") # Log
|
| 532 |
df['Date'] = pd.to_datetime(df['Date'], format='%d/%m/%Y', errors='coerce')
|
| 533 |
|
| 534 |
# Get min and max dates from transactions
|
|
|
|
| 539 |
# Update session state with actual transaction date range
|
| 540 |
st.session_state['min_date'] = min_date
|
| 541 |
st.session_state['max_date'] = max_date
|
| 542 |
+
st.info(f"Determined transaction date range: {min_date} to {max_date}.") # Log
|
| 543 |
+
else:
|
| 544 |
+
st.warning("Could not determine valid date range from transactions. Using default dates.") # Log
|
| 545 |
|
| 546 |
# Format dates for display
|
| 547 |
df['Date'] = df['Date'].dt.strftime('%d/%m/%Y')
|
| 548 |
|
| 549 |
except Exception as e:
|
| 550 |
+
st.warning("Some transaction dates could not be formatted correctly.")
|
| 551 |
st.exception(e)
|
| 552 |
|
| 553 |
st.success("Transactions loaded successfully!")
|
|
|
|
| 556 |
else:
|
| 557 |
st.warning("No valid transactions could be extracted from the documents.")
|
| 558 |
else:
|
| 559 |
+
st.info("No transactions loaded yet. Upload files to begin.") # Initial state log
|
| 560 |
|
| 561 |
# Financial report generation parameters
|
| 562 |
st.write("### Generate Financial Report")
|
|
|
|
| 571 |
statement_type = st.selectbox("Select Financial Statement", ["Income Statement", "Cashflow Statement", "Balance Sheet"])
|
| 572 |
|
| 573 |
if st.button("Generate Financial Report"):
|
| 574 |
+
st.info(f"User clicked 'Generate Financial Report' for {statement_type} from {start_date} to {end_date}.") # Log button click
|
| 575 |
if not all_transactions:
|
| 576 |
+
st.error("No transactions available to generate report. Please upload files first.") # Log
|
| 577 |
else:
|
| 578 |
# Filter transactions by date
|
| 579 |
+
st.info(f"Filtering {len(all_transactions)} transactions for the period {start_date} to {end_date}...") # Log filtering
|
| 580 |
filtered_transactions = []
|
| 581 |
for transaction in all_transactions:
|
| 582 |
try:
|
|
|
|
| 584 |
if start_date <= transaction_date <= end_date:
|
| 585 |
filtered_transactions.append(transaction)
|
| 586 |
except (ValueError, TypeError):
|
| 587 |
+
st.warning(f"Could not parse date for transaction, skipping: {transaction}") # Log problematic transactions
|
| 588 |
continue
|
| 589 |
|
| 590 |
if not filtered_transactions:
|
| 591 |
+
st.warning("No transactions found within the selected date range. Please adjust dates or upload relevant files.") # Log
|
| 592 |
else:
|
| 593 |
+
st.info(f"Found {len(filtered_transactions)} transactions within the selected date range.") # Log filtered count
|
| 594 |
try:
|
| 595 |
model1 = configure_gemini1(api_key)
|
| 596 |
combined_json = {"transactions": filtered_transactions}
|
| 597 |
with st.spinner("Generating financial report..."):
|
| 598 |
report_text = generate_financial_report(model1, combined_json, start_date, end_date, statement_type)
|
| 599 |
if report_text:
|
| 600 |
+
st.success("Financial report generated successfully by Gemini!") # Log report text ready
|
| 601 |
|
| 602 |
# Display the report as markdown
|
| 603 |
st.markdown("### Financial Report Preview")
|
|
|
|
| 605 |
|
| 606 |
# Create PDF from markdown
|
| 607 |
try:
|
| 608 |
+
st.info("Attempting to generate PDF from the report markdown.") # Log PDF start
|
| 609 |
pdf_buffer = create_pdf_report(report_text)
|
| 610 |
st.download_button(
|
| 611 |
label="Download Financial Report as PDF",
|
|
|
|
| 613 |
file_name=f"{statement_type.replace(' ', '_')}_{datetime.now().strftime('%Y%m%d')}.pdf",
|
| 614 |
mime="application/pdf"
|
| 615 |
)
|
| 616 |
+
st.success("PDF download button enabled.") # Log
|
| 617 |
except Exception as e:
|
| 618 |
+
st.error(f"Error generating PDF for download: {str(e)}") # Log PDF error
|
| 619 |
st.info("For better PDF generation, please ensure NotoSans fonts are installed in the same directory.")
|
| 620 |
+
st.exception(e) # Show traceback
|
| 621 |
except exceptions.ServiceUnavailable as e:
|
| 622 |
if e.response.status_code == 504:
|
| 623 |
st.error("Error generating report: Gemini API timed out (504). Please try reducing the time period for the report.")
|
| 624 |
else:
|
| 625 |
+
st.error(f"Error generating financial report due to Gemini API issue: {str(e)}") # Log API error
|
| 626 |
+
st.exception(e) # Show traceback
|
| 627 |
except Exception as e:
|
| 628 |
+
st.error(f"An unexpected error occurred while generating the financial report: {str(e)}") # Log general error
|
| 629 |
if "504" in str(e):
|
| 630 |
st.info("The Gemini API might be overloaded. Consider generating reports for smaller time periods.")
|
| 631 |
elif len(filtered_transactions) > 500:
|
| 632 |
st.info("For large datasets, consider generating reports for smaller time periods.")
|
| 633 |
+
st.exception(e) # Show traceback
|
| 634 |
|
| 635 |
if __name__ == "__main__":
|
| 636 |
main()
|